1// Copyright 2016 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.
6
7//go:build gc && !purego
8
9#include "textflag.h"
10// General register allocation
11#define oup DI
12#define inp SI
13#define inl BX
14#define adp CX // free to reuse, after we hash the additional data
15#define keyp R8 // free to reuse, when we copy the key to stack
16#define itr2 R9 // general iterator
17#define itr1 CX // general iterator
18#define acc0 R10
19#define acc1 R11
20#define acc2 R12
21#define t0 R13
22#define t1 R14
23#define t2 R15
24#define t3 R8
25// Register and stack allocation for the SSE code
26#define rStore (0*16)(BP)
27#define sStore (1*16)(BP)
28#define state1Store (2*16)(BP)
29#define state2Store (3*16)(BP)
30#define tmpStore (4*16)(BP)
31#define ctr0Store (5*16)(BP)
32#define ctr1Store (6*16)(BP)
33#define ctr2Store (7*16)(BP)
34#define ctr3Store (8*16)(BP)
35#define A0 X0
36#define A1 X1
37#define A2 X2
38#define B0 X3
39#define B1 X4
40#define B2 X5
41#define C0 X6
42#define C1 X7
43#define C2 X8
44#define D0 X9
45#define D1 X10
46#define D2 X11
47#define T0 X12
48#define T1 X13
49#define T2 X14
50#define T3 X15
51#define A3 T0
52#define B3 T1
53#define C3 T2
54#define D3 T3
55// Register and stack allocation for the AVX2 code
56#define rsStoreAVX2 (0*32)(BP)
57#define state1StoreAVX2 (1*32)(BP)
58#define state2StoreAVX2 (2*32)(BP)
59#define ctr0StoreAVX2 (3*32)(BP)
60#define ctr1StoreAVX2 (4*32)(BP)
61#define ctr2StoreAVX2 (5*32)(BP)
62#define ctr3StoreAVX2 (6*32)(BP)
63#define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack
64#define AA0 Y0
65#define AA1 Y5
66#define AA2 Y6
67#define AA3 Y7
68#define BB0 Y14
69#define BB1 Y9
70#define BB2 Y10
71#define BB3 Y11
72#define CC0 Y12
73#define CC1 Y13
74#define CC2 Y8
75#define CC3 Y15
76#define DD0 Y4
77#define DD1 Y1
78#define DD2 Y2
79#define DD3 Y3
80#define TT0 DD3
81#define TT1 AA3
82#define TT2 BB3
83#define TT3 CC3
84// ChaCha20 constants
85DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
86DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e
87DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32
88DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574
89DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865
90DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e
91DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32
92DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574
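// The four words spell "expand 32-byte k"; the constant is laid out twice so a
// single 32-byte load also fills both 128-bit lanes of a YMM register in the
// AVX2 code.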
93// <<< 16 with PSHUFB
94DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
95DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
96DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302
97DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
98// <<< 8 with PSHUFB
99DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
100DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
101DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003
102DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
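// Each byte of these PSHUFB masks selects the source byte that lands in that
// position, rotating every 32-bit lane left by 16 (rol16) or 8 (rol8) bits in
// a single shuffle.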
103
104DATA ·avx2InitMask<>+0x00(SB)/8, $0x0
105DATA ·avx2InitMask<>+0x08(SB)/8, $0x0
106DATA ·avx2InitMask<>+0x10(SB)/8, $0x1
107DATA ·avx2InitMask<>+0x18(SB)/8, $0x0
108
109DATA ·avx2IncMask<>+0x00(SB)/8, $0x2
110DATA ·avx2IncMask<>+0x08(SB)/8, $0x0
111DATA ·avx2IncMask<>+0x10(SB)/8, $0x2
112DATA ·avx2IncMask<>+0x18(SB)/8, $0x0
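// A YMM register holds two ChaCha20 blocks, one per 128-bit lane:
// avx2InitMask puts the second lane's block counter one ahead of the first,
// and avx2IncMask then advances both lanes by two blocks at a time.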
113// Poly1305 key clamp
114DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
115DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
116DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
117DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
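// The low 16 bytes clamp the Poly1305 "r" value as the algorithm requires;
// the all-ones high 16 bytes leave "s" untouched when the full 32-byte mask
// is applied in the AVX2 path.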
118
119DATA ·sseIncMask<>+0x00(SB)/8, $0x1
120DATA ·sseIncMask<>+0x08(SB)/8, $0x0
121// To load/store the last < 16 bytes in a buffer
122DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff
123DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000
124DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff
125DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000
126DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff
127DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000
128DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff
129DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000
130DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff
131DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000
132DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff
133DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000
134DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff
135DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000
136DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff
137DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000
138DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff
139DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff
140DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff
141DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff
142DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff
143DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff
144DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff
145DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff
146DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff
147DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff
148DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff
149DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
150DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff
151DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
152
153GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32
154GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32
155GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32
156GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16
157GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32
158GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32
159GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32
160GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240
161// No PALIGNR in Go ASM yet (but VPALIGNR is present).
162#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
163#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
164#define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5
165#define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13
166#define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6
167#define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7
168#define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8
169#define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14
170#define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9
171#define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10
172#define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11
173#define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15
174#define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3
175#define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4
176#define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5
177#define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13
178#define shiftC0Right shiftC0Left
179#define shiftC1Right shiftC1Left
180#define shiftC2Right shiftC2Left
181#define shiftC3Right shiftC3Left
182#define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9
183#define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10
184#define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11
185#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
186
187// Some macros
188
189// ROL rotates the uint32s in register R left by N bits, using temporary T.
190#define ROL(N, R, T) \
191	MOVO R, T; PSLLL $(N), T; PSRLL $(32-(N)), R; PXOR T, R
192
193// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed.
194#ifdef GOAMD64_v2
195#define ROL16(R, T) PSHUFB ·rol16<>(SB), R
196#else
197#define ROL16(R, T) ROL(16, R, T)
198#endif
199
200// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed.
201#ifdef GOAMD64_v2
202#define ROL8(R, T) PSHUFB ·rol8<>(SB), R
203#else
204#define ROL8(R, T) ROL(8, R, T)
205#endif
206
207#define chachaQR(A, B, C, D, T) \
208	PADDD B, A; PXOR A, D; ROL16(D, T) \
209	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \
210	PADDD B, A; PXOR A, D; ROL8(D, T) \
211	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
212
213#define chachaQR_AVX2(A, B, C, D, T) \
214	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D                         \
215	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \
216	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D                          \
217	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B
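
// chachaQR and chachaQR_AVX2 are the standard ChaCha quarter round, applied to
// four (SSE) or eight (AVX2) columns at once. A hedged reference sketch of the
// scalar operation in Go, for illustration only (quarterRound is an
// illustrative name, not part of this package):
//
//	import "math/bits"
//
//	func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
//		a += b; d ^= a; d = bits.RotateLeft32(d, 16)
//		c += d; b ^= c; b = bits.RotateLeft32(b, 12)
//		a += b; d ^= a; d = bits.RotateLeft32(d, 8)
//		c += d; b ^= c; b = bits.RotateLeft32(b, 7)
//		return a, b, c, d
//	}
//
// The 16- and 8-bit rotations use the PSHUFB tables above where available; the
// 12- and 7-bit rotations use shift-and-xor through the temporary register.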
218
219#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
220#define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2
221#define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX
222#define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3
223#define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t3, t2; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2
224
225#define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2
226#define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3
227#define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3
228
229#define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage
230#define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage
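
// Taken together, polyAdd and polyMul perform one Poly1305 block update: fold
// in 16 bytes of data plus the implicit 2^128 padding bit, multiply the
// accumulator by the clamped key r (kept at rStore), and reduce modulo
// 2^130 - 5 (the macros keep the accumulator only partially reduced). A hedged
// reference sketch in Go, using math/big for clarity instead of the 64-bit
// limb arithmetic used here (illustration only; polyUpdate and p1305 are
// made-up names, not part of this package):
//
//	import (
//		"encoding/binary"
//		"math/big"
//	)
//
//	var p1305 = new(big.Int).Sub(new(big.Int).Lsh(big.NewInt(1), 130), big.NewInt(5))
//
//	// polyUpdate computes acc = ((acc + block + 2^128) * r) mod (2^130 - 5).
//	func polyUpdate(acc, r *big.Int, block []byte) *big.Int { // len(block) == 16
//		m := new(big.Int).SetUint64(binary.LittleEndian.Uint64(block[8:16]))
//		m.Lsh(m, 64)
//		m.Add(m, new(big.Int).SetUint64(binary.LittleEndian.Uint64(block[0:8])))
//		m.Add(m, new(big.Int).Lsh(big.NewInt(1), 128)) // the ADCQ $1, acc2 in polyAdd
//		acc.Add(acc, m)                                // polyAdd
//		acc.Mul(acc, r)                                // polyMulStage1..3
//		return acc.Mod(acc, p1305)                     // polyMulReduceStage
//	}
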
231// ----------------------------------------------------------------------------
232TEXT polyHashADInternal<>(SB), NOSPLIT, $0
233	// adp points to beginning of additional data
234	// itr2 holds ad length
235	XORQ acc0, acc0
236	XORQ acc1, acc1
237	XORQ acc2, acc2
238	CMPQ itr2, $13
239	JNE  hashADLoop
240
241openFastTLSAD:
242	// Special treatment for the TLS case of 13 bytes
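	// The AD is zero padded to a full 16-byte Poly1305 block, so the two
	// overlapping 8-byte loads below pick up bytes 0-7 and 8-12 (the SHRQ
	// discards the three overlapping bytes), and MOVQ $1, acc2 supplies the
	// 2^128 padding bit that polyAdd would normally add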
243	MOVQ (adp), acc0
244	MOVQ 5(adp), acc1
245	SHRQ $24, acc1
246	MOVQ $1, acc2
247	polyMul
248	RET
249
250hashADLoop:
251	// Hash in 16 byte chunks
252	CMPQ itr2, $16
253	JB   hashADTail
254	polyAdd(0(adp))
255	LEAQ (1*16)(adp), adp
256	SUBQ $16, itr2
257	polyMul
258	JMP  hashADLoop
259
260hashADTail:
261	CMPQ itr2, $0
262	JE   hashADDone
263
264	// Hash last < 16 byte tail
265	XORQ t0, t0
266	XORQ t1, t1
267	XORQ t2, t2
268	ADDQ itr2, adp
269
270hashADTailLoop:
271	SHLQ $8, t0, t1
272	SHLQ $8, t0
273	MOVB -1(adp), t2
274	XORQ t2, t0
275	DECQ adp
276	DECQ itr2
277	JNE  hashADTailLoop
278
279hashADTailFinish:
280	ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
281	polyMul
282
283	// Finished AD
284hashADDone:
285	RET
286
287// ----------------------------------------------------------------------------
288// func chacha20Poly1305Open(dst, key, src, ad []byte) bool
289TEXT ·chacha20Poly1305Open(SB), 0, $288-97
290	// For aligned stack access
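	// Round BP up to a 32-byte boundary inside the 288-byte local frame so
	// that the state spilled through the *Store macros is always 32-byte
	// aligned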
291	MOVQ SP, BP
292	ADDQ $32, BP
293	ANDQ $-32, BP
294	MOVQ dst+0(FP), oup
295	MOVQ key+24(FP), keyp
296	MOVQ src+48(FP), inp
297	MOVQ src_len+56(FP), inl
298	MOVQ ad+72(FP), adp
299
300	// Check for AVX2 support
301	CMPB ·useAVX2(SB), $1
302	JE   chacha20Poly1305Open_AVX2
303
304	// Special optimization for very short buffers
305	CMPQ inl, $128
306	JBE  openSSE128 // About 16% faster
307
308	// For long buffers, prepare the poly key first
309	MOVOU ·chacha20Constants<>(SB), A0
310	MOVOU (1*16)(keyp), B0
311	MOVOU (2*16)(keyp), C0
312	MOVOU (3*16)(keyp), D0
313	MOVO  D0, T1
314
315	// Store state on stack for future use
316	MOVO B0, state1Store
317	MOVO C0, state2Store
318	MOVO D0, ctr3Store
319	MOVQ $10, itr2
320
321openSSEPreparePolyKey:
322	chachaQR(A0, B0, C0, D0, T0)
323	shiftB0Left;  shiftC0Left; shiftD0Left
324	chachaQR(A0, B0, C0, D0, T0)
325	shiftB0Right; shiftC0Right; shiftD0Right
326	DECQ          itr2
327	JNE           openSSEPreparePolyKey
328
329	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
330	PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0
331
332	// Clamp and store the key
333	PAND ·polyClampMask<>(SB), A0
334	MOVO A0, rStore; MOVO B0, sStore
335
336	// Hash AAD
337	MOVQ ad_len+80(FP), itr2
338	CALL polyHashADInternal<>(SB)
339
340openSSEMainLoop:
341	CMPQ inl, $256
342	JB   openSSEMainLoopDone
343
344	// Load state, increment counter blocks
345	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
346	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
347	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
348	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
349
350	// Store counters
351	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
352
353	// There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
354	MOVQ $4, itr1
355	MOVQ inp, itr2
356
357openSSEInternalLoop:
358	MOVO          C3, tmpStore
359	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
360	MOVO          tmpStore, C3
361	MOVO          C1, tmpStore
362	chachaQR(A3, B3, C3, D3, C1)
363	MOVO          tmpStore, C1
364	polyAdd(0(itr2))
365	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
366	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
367	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
368	polyMulStage1
369	polyMulStage2
370	LEAQ          (2*8)(itr2), itr2
371	MOVO          C3, tmpStore
372	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
373	MOVO          tmpStore, C3
374	MOVO          C1, tmpStore
375	polyMulStage3
376	chachaQR(A3, B3, C3, D3, C1)
377	MOVO          tmpStore, C1
378	polyMulReduceStage
379	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
380	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
381	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
382	DECQ          itr1
383	JGE           openSSEInternalLoop
384
385	polyAdd(0(itr2))
386	polyMul
387	LEAQ (2*8)(itr2), itr2
388
389	CMPQ itr1, $-6
390	JG   openSSEInternalLoop
391
392	// Add in the state
393	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
394	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
395	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
396	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
397
398	// Load - xor - store
399	MOVO  D3, tmpStore
400	MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup)
401	MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup)
402	MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup)
403	MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup)
404	MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup)
405	MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup)
406	MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup)
407	MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup)
408	MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup)
409	MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup)
410	MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup)
411	MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup)
412	MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup)
413	MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup)
414	MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup)
415	MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup)
416	LEAQ  256(inp), inp
417	LEAQ  256(oup), oup
418	SUBQ  $256, inl
419	JMP   openSSEMainLoop
420
421openSSEMainLoopDone:
422	// Handle the various tail sizes efficiently
423	TESTQ inl, inl
424	JE    openSSEFinalize
425	CMPQ  inl, $64
426	JBE   openSSETail64
427	CMPQ  inl, $128
428	JBE   openSSETail128
429	CMPQ  inl, $192
430	JBE   openSSETail192
431	JMP   openSSETail256
432
433openSSEFinalize:
434	// Hash in the PT, AAD lengths
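	// The final Poly1305 block is len(AD) and len(ciphertext) as two
	// little-endian 64-bit words, plus the usual 2^128 padding bit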
435	ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2
436	polyMul
437
438	// Final reduce
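	// Compute acc - (2^130 - 5): the immediates -5, -1 and 3 below form that
	// constant across the three limbs. If the subtraction borrows, the CMOVs
	// keep the original, already reduced accumulator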
439	MOVQ    acc0, t0
440	MOVQ    acc1, t1
441	MOVQ    acc2, t2
442	SUBQ    $-5, acc0
443	SBBQ    $-1, acc1
444	SBBQ    $3, acc2
445	CMOVQCS t0, acc0
446	CMOVQCS t1, acc1
447	CMOVQCS t2, acc2
448
449	// Add in the "s" part of the key
450	ADDQ 0+sStore, acc0
451	ADCQ 8+sStore, acc1
452
453	// Finally, constant-time compare to the tag at the end of the message
454	XORQ    AX, AX
455	MOVQ    $1, DX
456	XORQ    (0*8)(inp), acc0
457	XORQ    (1*8)(inp), acc1
458	ORQ     acc1, acc0
459	CMOVQEQ DX, AX
460
461	// Return true iff tags are equal
462	MOVB AX, ret+96(FP)
463	RET
464
465// ----------------------------------------------------------------------------
466// Special optimization for buffers smaller than 129 bytes
467openSSE128:
468	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
469	MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
470	MOVO  A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
471	MOVO  A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
472	MOVO  B0, T1; MOVO C0, T2; MOVO D1, T3
473	MOVQ  $10, itr2
474
475openSSE128InnerCipherLoop:
476	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
477	shiftB0Left;  shiftB1Left; shiftB2Left
478	shiftC0Left;  shiftC1Left; shiftC2Left
479	shiftD0Left;  shiftD1Left; shiftD2Left
480	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
481	shiftB0Right; shiftB1Right; shiftB2Right
482	shiftC0Right; shiftC1Right; shiftC2Right
483	shiftD0Right; shiftD1Right; shiftD2Right
484	DECQ          itr2
485	JNE           openSSE128InnerCipherLoop
486
487	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
488	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
489	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
490	PADDL T2, C1; PADDL T2, C2
491	PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
492
493	// Clamp and store the key
494	PAND  ·polyClampMask<>(SB), A0
495	MOVOU A0, rStore; MOVOU B0, sStore
496
497	// Hash
498	MOVQ ad_len+80(FP), itr2
499	CALL polyHashADInternal<>(SB)
500
501openSSE128Open:
502	CMPQ inl, $16
503	JB   openSSETail16
504	SUBQ $16, inl
505
506	// Load for hashing
507	polyAdd(0(inp))
508
509	// Load for decryption
510	MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup)
511	LEAQ  (1*16)(inp), inp
512	LEAQ  (1*16)(oup), oup
513	polyMul
514
515	// Shift the stream "left"
516	MOVO B1, A1
517	MOVO C1, B1
518	MOVO D1, C1
519	MOVO A2, D1
520	MOVO B2, A2
521	MOVO C2, B2
522	MOVO D2, C2
523	JMP  openSSE128Open
524
525openSSETail16:
526	TESTQ inl, inl
527	JE    openSSEFinalize
528
529	// We can safely load the CT from the end, because it is padded with the MAC
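	// ·andMask<> holds masks that keep the low 1..15 bytes; with itr2 = inl*16,
	// the -16(t0)(itr2*1) operand below selects the mask for exactly inl bytes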
530	MOVQ   inl, itr2
531	SHLQ   $4, itr2
532	LEAQ   ·andMask<>(SB), t0
533	MOVOU  (inp), T0
534	ADDQ   inl, inp
535	PAND   -16(t0)(itr2*1), T0
536	MOVO   T0, 0+tmpStore
537	MOVQ   T0, t0
538	MOVQ   8+tmpStore, t1
539	PXOR   A1, T0
540
541	// We can only store one byte at a time, since plaintext can be shorter than 16 bytes
542openSSETail16Store:
543	MOVQ T0, t3
544	MOVB t3, (oup)
545	PSRLDQ $1, T0
546	INCQ   oup
547	DECQ   inl
548	JNE    openSSETail16Store
549	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
550	polyMul
551	JMP    openSSEFinalize
552
553// ----------------------------------------------------------------------------
554// Special optimization for the last 64 bytes of ciphertext
555openSSETail64:
556	// Need to decrypt up to 64 bytes - prepare a single block
557	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
558	XORQ itr2, itr2
559	MOVQ inl, itr1
560	CMPQ itr1, $16
561	JB   openSSETail64LoopB
562
563openSSETail64LoopA:
564	// Perform ChaCha rounds, while hashing the remaining input
565	polyAdd(0(inp)(itr2*1))
566	polyMul
567	SUBQ $16, itr1
568
569openSSETail64LoopB:
570	ADDQ          $16, itr2
571	chachaQR(A0, B0, C0, D0, T0)
572	shiftB0Left;  shiftC0Left; shiftD0Left
573	chachaQR(A0, B0, C0, D0, T0)
574	shiftB0Right; shiftC0Right; shiftD0Right
575
576	CMPQ itr1, $16
577	JAE  openSSETail64LoopA
578
579	CMPQ itr2, $160
580	JNE  openSSETail64LoopB
581
582	PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0
583
584openSSETail64DecLoop:
585	CMPQ  inl, $16
586	JB    openSSETail64DecLoopDone
587	SUBQ  $16, inl
588	MOVOU (inp), T0
589	PXOR  T0, A0
590	MOVOU A0, (oup)
591	LEAQ  16(inp), inp
592	LEAQ  16(oup), oup
593	MOVO  B0, A0
594	MOVO  C0, B0
595	MOVO  D0, C0
596	JMP   openSSETail64DecLoop
597
598openSSETail64DecLoopDone:
599	MOVO A0, A1
600	JMP  openSSETail16
601
602// ----------------------------------------------------------------------------
603// Special optimization for the last 128 bytes of ciphertext
604openSSETail128:
605	// Need to decrypt up to 128 bytes - prepare two blocks
606	MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store
607	MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store
608	XORQ itr2, itr2
609	MOVQ inl, itr1
610	ANDQ $-16, itr1
611
612openSSETail128LoopA:
613	// Perform ChaCha rounds, while hashing the remaining input
614	polyAdd(0(inp)(itr2*1))
615	polyMul
616
617openSSETail128LoopB:
618	ADDQ          $16, itr2
619	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
620	shiftB0Left;  shiftC0Left; shiftD0Left
621	shiftB1Left;  shiftC1Left; shiftD1Left
622	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
623	shiftB0Right; shiftC0Right; shiftD0Right
624	shiftB1Right; shiftC1Right; shiftD1Right
625
626	CMPQ itr2, itr1
627	JB   openSSETail128LoopA
628
629	CMPQ itr2, $160
630	JNE  openSSETail128LoopB
631
632	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
633	PADDL state1Store, B0; PADDL state1Store, B1
634	PADDL state2Store, C0; PADDL state2Store, C1
635	PADDL ctr1Store, D0; PADDL ctr0Store, D1
636
637	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
638	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
639	MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
640
641	SUBQ $64, inl
642	LEAQ 64(inp), inp
643	LEAQ 64(oup), oup
644	JMP  openSSETail64DecLoop
645
646// ----------------------------------------------------------------------------
647// Special optimization for the last 192 bytes of ciphertext
648openSSETail192:
649	// Need to decrypt up to 192 bytes - prepare three blocks
650	MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store
651	MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
652	MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store
653
654	MOVQ    inl, itr1
655	MOVQ    $160, itr2
656	CMPQ    itr1, $160
657	CMOVQGT itr2, itr1
658	ANDQ    $-16, itr1
659	XORQ    itr2, itr2
660
661openSSLTail192LoopA:
662	// Perform ChaCha rounds, while hashing the remaining input
663	polyAdd(0(inp)(itr2*1))
664	polyMul
665
666openSSLTail192LoopB:
667	ADDQ         $16, itr2
668	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
669	shiftB0Left; shiftC0Left; shiftD0Left
670	shiftB1Left; shiftC1Left; shiftD1Left
671	shiftB2Left; shiftC2Left; shiftD2Left
672
673	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
674	shiftB0Right; shiftC0Right; shiftD0Right
675	shiftB1Right; shiftC1Right; shiftD1Right
676	shiftB2Right; shiftC2Right; shiftD2Right
677
678	CMPQ itr2, itr1
679	JB   openSSLTail192LoopA
680
681	CMPQ itr2, $160
682	JNE  openSSLTail192LoopB
683
684	CMPQ inl, $176
685	JB   openSSLTail192Store
686
687	polyAdd(160(inp))
688	polyMul
689
690	CMPQ inl, $192
691	JB   openSSLTail192Store
692
693	polyAdd(176(inp))
694	polyMul
695
696openSSLTail192Store:
697	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
698	PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
699	PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
700	PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2
701
702	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
703	PXOR  T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2
704	MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup)
705
706	MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
707	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
708	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
709
710	SUBQ $128, inl
711	LEAQ 128(inp), inp
712	LEAQ 128(oup), oup
713	JMP  openSSETail64DecLoop
714
715// ----------------------------------------------------------------------------
716// Special optimization for the last 256 bytes of ciphertext
717openSSETail256:
718	// Need to decrypt up to 256 bytes - prepare four blocks
719	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
720	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
721	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
722	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
723
724	// Store counters
725	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
726	XORQ itr2, itr2
727
728openSSETail256Loop:
729	// This loop interleaves 8 ChaCha quarter rounds with 1 poly multiplication
730	polyAdd(0(inp)(itr2*1))
731	MOVO          C3, tmpStore
732	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
733	MOVO          tmpStore, C3
734	MOVO          C1, tmpStore
735	chachaQR(A3, B3, C3, D3, C1)
736	MOVO          tmpStore, C1
737	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
738	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
739	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
740	polyMulStage1
741	polyMulStage2
742	MOVO          C3, tmpStore
743	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
744	MOVO          tmpStore, C3
745	MOVO          C1, tmpStore
746	chachaQR(A3, B3, C3, D3, C1)
747	MOVO          tmpStore, C1
748	polyMulStage3
749	polyMulReduceStage
750	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
751	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
752	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
753	ADDQ          $2*8, itr2
754	CMPQ          itr2, $160
755	JB            openSSETail256Loop
756	MOVQ          inl, itr1
757	ANDQ          $-16, itr1
758
759openSSETail256HashLoop:
760	polyAdd(0(inp)(itr2*1))
761	polyMul
762	ADDQ $2*8, itr2
763	CMPQ itr2, itr1
764	JB   openSSETail256HashLoop
765
766	// Add in the state
767	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
768	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
769	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
770	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
771	MOVO  D3, tmpStore
772
773	// Load - xor - store
774	MOVOU (0*16)(inp), D3; PXOR D3, A0
775	MOVOU (1*16)(inp), D3; PXOR D3, B0
776	MOVOU (2*16)(inp), D3; PXOR D3, C0
777	MOVOU (3*16)(inp), D3; PXOR D3, D0
778	MOVOU A0, (0*16)(oup)
779	MOVOU B0, (1*16)(oup)
780	MOVOU C0, (2*16)(oup)
781	MOVOU D0, (3*16)(oup)
782	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
783	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
784	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
785	MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
786	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
787	MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
788	LEAQ  192(inp), inp
789	LEAQ  192(oup), oup
790	SUBQ  $192, inl
791	MOVO  A3, A0
792	MOVO  B3, B0
793	MOVO  C3, C0
794	MOVO  tmpStore, D0
795
796	JMP openSSETail64DecLoop
797
798// ----------------------------------------------------------------------------
799// ------------------------- AVX2 Code ----------------------------------------
800chacha20Poly1305Open_AVX2:
801	VZEROUPPER
802	VMOVDQU ·chacha20Constants<>(SB), AA0
803	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
804	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
805	BYTE    $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
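	// The BYTE sequences above are hand-assembled VBROADCASTI128 loads (see
	// the trailing comments), presumably encoded by hand because the assembler
	// did not accept the mnemonic; each copies 16 bytes of key or counter/nonce
	// state into both lanes of a YMM register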
806	VPADDD  ·avx2InitMask<>(SB), DD0, DD0
807
808	// Special optimization for very short buffers
809	CMPQ inl, $192
810	JBE  openAVX2192
811	CMPQ inl, $320
812	JBE  openAVX2320
813
814	// For the general case, prepare the poly key first - as a byproduct we get 64 bytes of cipher stream
815	VMOVDQA BB0, state1StoreAVX2
816	VMOVDQA CC0, state2StoreAVX2
817	VMOVDQA DD0, ctr3StoreAVX2
818	MOVQ    $10, itr2
819
820openAVX2PreparePolyKey:
821	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
822	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
823	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
824	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
825	DECQ     itr2
826	JNE      openAVX2PreparePolyKey
827
828	VPADDD ·chacha20Constants<>(SB), AA0, AA0
829	VPADDD state1StoreAVX2, BB0, BB0
830	VPADDD state2StoreAVX2, CC0, CC0
831	VPADDD ctr3StoreAVX2, DD0, DD0
832
833	VPERM2I128 $0x02, AA0, BB0, TT0
834
835	// Clamp and store poly key
836	VPAND   ·polyClampMask<>(SB), TT0, TT0
837	VMOVDQA TT0, rsStoreAVX2
838
839	// Stream for the first 64 bytes
840	VPERM2I128 $0x13, AA0, BB0, AA0
841	VPERM2I128 $0x13, CC0, DD0, BB0
842
843	// Hash AD + first 64 bytes
844	MOVQ ad_len+80(FP), itr2
845	CALL polyHashADInternal<>(SB)
846	XORQ itr1, itr1
847
848openAVX2InitialHash64:
849	polyAdd(0(inp)(itr1*1))
850	polyMulAVX2
851	ADDQ $16, itr1
852	CMPQ itr1, $64
853	JNE  openAVX2InitialHash64
854
855	// Decrypt the first 64 bytes
856	VPXOR   (0*32)(inp), AA0, AA0
857	VPXOR   (1*32)(inp), BB0, BB0
858	VMOVDQU AA0, (0*32)(oup)
859	VMOVDQU BB0, (1*32)(oup)
860	LEAQ    (2*32)(inp), inp
861	LEAQ    (2*32)(oup), oup
862	SUBQ    $64, inl
863
864openAVX2MainLoop:
865	CMPQ inl, $512
866	JB   openAVX2MainLoopDone
867
868	// Load state, increment counter blocks, store the incremented counters
869	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
870	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
871	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
872	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
873	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
874	XORQ    itr1, itr1
875
876openAVX2InternalLoop:
877	// Let's just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications
878	// Effectively, for every 512 bytes of stream we hash 480 bytes of ciphertext
879	polyAdd(0*8(inp)(itr1*1))
880	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
881	polyMulStage1_AVX2
882	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
883	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
884	polyMulStage2_AVX2
885	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
886	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
887	polyMulStage3_AVX2
888	VMOVDQA  CC3, tmpStoreAVX2
889	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
890	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
891	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
892	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
893	VMOVDQA  tmpStoreAVX2, CC3
894	polyMulReduceStage
895	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
896	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
897	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
898	polyAdd(2*8(inp)(itr1*1))
899	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
900	polyMulStage1_AVX2
901	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
902	VMOVDQA  CC3, tmpStoreAVX2
903	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
904	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
905	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
906	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
907	VMOVDQA  tmpStoreAVX2, CC3
908	polyMulStage2_AVX2
909	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
910	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
911	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
912	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
913	polyMulStage3_AVX2
914	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
915	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
916	polyMulReduceStage
917	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
918	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
919	polyAdd(4*8(inp)(itr1*1))
920	LEAQ     (6*8)(itr1), itr1
921	VMOVDQA  CC3, tmpStoreAVX2
922	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
923	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
924	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
925	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
926	VMOVDQA  tmpStoreAVX2, CC3
927	polyMulStage1_AVX2
928	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
929	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
930	polyMulStage2_AVX2
931	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
932	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
933	polyMulStage3_AVX2
934	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
935	VMOVDQA  CC3, tmpStoreAVX2
936	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
937	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
938	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
939	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
940	VMOVDQA  tmpStoreAVX2, CC3
941	polyMulReduceStage
942	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
943	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
944	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
945	CMPQ     itr1, $480
946	JNE      openAVX2InternalLoop
947
948	VPADDD  ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
949	VPADDD  state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
950	VPADDD  state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
951	VPADDD  ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
952	VMOVDQA CC3, tmpStoreAVX2
953
954	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
955	polyAdd(480(inp))
956	polyMulAVX2
957	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
958	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
959	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
960	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
961	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
962	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
963
964	// and here
965	polyAdd(496(inp))
966	polyMulAVX2
967	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
968	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
969	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
970	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
971	VPXOR      (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
972	VMOVDQU    AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
973	LEAQ       (32*16)(inp), inp
974	LEAQ       (32*16)(oup), oup
975	SUBQ       $(32*16), inl
976	JMP        openAVX2MainLoop
977
978openAVX2MainLoopDone:
979	// Handle the various tail sizes efficiently
980	TESTQ inl, inl
981	JE    openSSEFinalize
982	CMPQ  inl, $128
983	JBE   openAVX2Tail128
984	CMPQ  inl, $256
985	JBE   openAVX2Tail256
986	CMPQ  inl, $384
987	JBE   openAVX2Tail384
988	JMP   openAVX2Tail512
989
990// ----------------------------------------------------------------------------
991// Special optimization for buffers smaller than 193 bytes
992openAVX2192:
993	// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
994	VMOVDQA AA0, AA1
995	VMOVDQA BB0, BB1
996	VMOVDQA CC0, CC1
997	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
998	VMOVDQA AA0, AA2
999	VMOVDQA BB0, BB2
1000	VMOVDQA CC0, CC2
1001	VMOVDQA DD0, DD2
1002	VMOVDQA DD1, TT3
1003	MOVQ    $10, itr2
1004
1005openAVX2192InnerCipherLoop:
1006	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1007	VPALIGNR   $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
1008	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
1009	VPALIGNR   $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
1010	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1011	VPALIGNR   $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
1012	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
1013	VPALIGNR   $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
1014	DECQ       itr2
1015	JNE        openAVX2192InnerCipherLoop
1016	VPADDD     AA2, AA0, AA0; VPADDD AA2, AA1, AA1
1017	VPADDD     BB2, BB0, BB0; VPADDD BB2, BB1, BB1
1018	VPADDD     CC2, CC0, CC0; VPADDD CC2, CC1, CC1
1019	VPADDD     DD2, DD0, DD0; VPADDD TT3, DD1, DD1
1020	VPERM2I128 $0x02, AA0, BB0, TT0
1021
1022	// Clamp and store poly key
1023	VPAND   ·polyClampMask<>(SB), TT0, TT0
1024	VMOVDQA TT0, rsStoreAVX2
1025
1026	// Stream for up to 192 bytes
1027	VPERM2I128 $0x13, AA0, BB0, AA0
1028	VPERM2I128 $0x13, CC0, DD0, BB0
1029	VPERM2I128 $0x02, AA1, BB1, CC0
1030	VPERM2I128 $0x02, CC1, DD1, DD0
1031	VPERM2I128 $0x13, AA1, BB1, AA1
1032	VPERM2I128 $0x13, CC1, DD1, BB1
1033
1034openAVX2ShortOpen:
1035	// Hash
1036	MOVQ ad_len+80(FP), itr2
1037	CALL polyHashADInternal<>(SB)
1038
1039openAVX2ShortOpenLoop:
1040	CMPQ inl, $32
1041	JB   openAVX2ShortTail32
1042	SUBQ $32, inl
1043
1044	// Load for hashing
1045	polyAdd(0*8(inp))
1046	polyMulAVX2
1047	polyAdd(2*8(inp))
1048	polyMulAVX2
1049
1050	// Load for decryption
1051	VPXOR   (inp), AA0, AA0
1052	VMOVDQU AA0, (oup)
1053	LEAQ    (1*32)(inp), inp
1054	LEAQ    (1*32)(oup), oup
1055
1056	// Shift stream left
1057	VMOVDQA BB0, AA0
1058	VMOVDQA CC0, BB0
1059	VMOVDQA DD0, CC0
1060	VMOVDQA AA1, DD0
1061	VMOVDQA BB1, AA1
1062	VMOVDQA CC1, BB1
1063	VMOVDQA DD1, CC1
1064	VMOVDQA AA2, DD1
1065	VMOVDQA BB2, AA2
1066	JMP     openAVX2ShortOpenLoop
1067
1068openAVX2ShortTail32:
1069	CMPQ    inl, $16
1070	VMOVDQA A0, A1
1071	JB      openAVX2ShortDone
1072
1073	SUBQ $16, inl
1074
1075	// Load for hashing
1076	polyAdd(0*8(inp))
1077	polyMulAVX2
1078
1079	// Load for decryption
1080	VPXOR      (inp), A0, T0
1081	VMOVDQU    T0, (oup)
1082	LEAQ       (1*16)(inp), inp
1083	LEAQ       (1*16)(oup), oup
1084	VPERM2I128 $0x11, AA0, AA0, AA0
1085	VMOVDQA    A0, A1
1086
1087openAVX2ShortDone:
1088	VZEROUPPER
1089	JMP openSSETail16
1090
1091// ----------------------------------------------------------------------------
1092// Special optimization for buffers smaller than 321 bytes
1093openAVX2320:
1094	// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
1095	VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
1096	VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
1097	VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
1098	MOVQ    $10, itr2
1099
1100openAVX2320InnerCipherLoop:
1101	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
1102	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
1103	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
1104	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
1105	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
1106	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
1107	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
1108	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
1109	DECQ     itr2
1110	JNE      openAVX2320InnerCipherLoop
1111
1112	VMOVDQA ·chacha20Constants<>(SB), TT0
1113	VPADDD  TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
1114	VPADDD  TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
1115	VPADDD  TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
1116	VMOVDQA ·avx2IncMask<>(SB), TT0
1117	VPADDD  TT3, DD0, DD0; VPADDD TT0, TT3, TT3
1118	VPADDD  TT3, DD1, DD1; VPADDD TT0, TT3, TT3
1119	VPADDD  TT3, DD2, DD2
1120
1121	// Clamp and store poly key
1122	VPERM2I128 $0x02, AA0, BB0, TT0
1123	VPAND      ·polyClampMask<>(SB), TT0, TT0
1124	VMOVDQA    TT0, rsStoreAVX2
1125
1126	// Stream for up to 320 bytes
1127	VPERM2I128 $0x13, AA0, BB0, AA0
1128	VPERM2I128 $0x13, CC0, DD0, BB0
1129	VPERM2I128 $0x02, AA1, BB1, CC0
1130	VPERM2I128 $0x02, CC1, DD1, DD0
1131	VPERM2I128 $0x13, AA1, BB1, AA1
1132	VPERM2I128 $0x13, CC1, DD1, BB1
1133	VPERM2I128 $0x02, AA2, BB2, CC1
1134	VPERM2I128 $0x02, CC2, DD2, DD1
1135	VPERM2I128 $0x13, AA2, BB2, AA2
1136	VPERM2I128 $0x13, CC2, DD2, BB2
1137	JMP        openAVX2ShortOpen
1138
1139// ----------------------------------------------------------------------------
1140// Special optimization for the last 128 bytes of ciphertext
1141openAVX2Tail128:
1142	// Need to decrypt up to 128 bytes - prepare two blocks
1143	VMOVDQA ·chacha20Constants<>(SB), AA1
1144	VMOVDQA state1StoreAVX2, BB1
1145	VMOVDQA state2StoreAVX2, CC1
1146	VMOVDQA ctr3StoreAVX2, DD1
1147	VPADDD  ·avx2IncMask<>(SB), DD1, DD1
1148	VMOVDQA DD1, DD0
1149
1150	XORQ  itr2, itr2
1151	MOVQ  inl, itr1
1152	ANDQ  $-16, itr1
1153	TESTQ itr1, itr1
1154	JE    openAVX2Tail128LoopB
1155
1156openAVX2Tail128LoopA:
1157	// Perform ChaCha rounds, while hashing the remaining input
1158	polyAdd(0(inp)(itr2*1))
1159	polyMulAVX2
1160
1161openAVX2Tail128LoopB:
1162	ADDQ     $16, itr2
1163	chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1164	VPALIGNR $4, BB1, BB1, BB1
1165	VPALIGNR $8, CC1, CC1, CC1
1166	VPALIGNR $12, DD1, DD1, DD1
1167	chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1168	VPALIGNR $12, BB1, BB1, BB1
1169	VPALIGNR $8, CC1, CC1, CC1
1170	VPALIGNR $4, DD1, DD1, DD1
1171	CMPQ     itr2, itr1
1172	JB       openAVX2Tail128LoopA
1173	CMPQ     itr2, $160
1174	JNE      openAVX2Tail128LoopB
1175
1176	VPADDD     ·chacha20Constants<>(SB), AA1, AA1
1177	VPADDD     state1StoreAVX2, BB1, BB1
1178	VPADDD     state2StoreAVX2, CC1, CC1
1179	VPADDD     DD0, DD1, DD1
1180	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
1181
1182openAVX2TailLoop:
1183	CMPQ inl, $32
1184	JB   openAVX2Tail
1185	SUBQ $32, inl
1186
1187	// Load for decryption
1188	VPXOR   (inp), AA0, AA0
1189	VMOVDQU AA0, (oup)
1190	LEAQ    (1*32)(inp), inp
1191	LEAQ    (1*32)(oup), oup
1192	VMOVDQA BB0, AA0
1193	VMOVDQA CC0, BB0
1194	VMOVDQA DD0, CC0
1195	JMP     openAVX2TailLoop
1196
1197openAVX2Tail:
1198	CMPQ    inl, $16
1199	VMOVDQA A0, A1
1200	JB      openAVX2TailDone
1201	SUBQ    $16, inl
1202
1203	// Load for decryption
1204	VPXOR      (inp), A0, T0
1205	VMOVDQU    T0, (oup)
1206	LEAQ       (1*16)(inp), inp
1207	LEAQ       (1*16)(oup), oup
1208	VPERM2I128 $0x11, AA0, AA0, AA0
1209	VMOVDQA    A0, A1
1210
1211openAVX2TailDone:
1212	VZEROUPPER
1213	JMP openSSETail16
1214
1215// ----------------------------------------------------------------------------
1216// Special optimization for the last 256 bytes of ciphertext
1217openAVX2Tail256:
1218	// Need to decrypt up to 256 bytes - prepare four blocks
1219	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
1220	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1
1221	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1
1222	VMOVDQA ctr3StoreAVX2, DD0
1223	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
1224	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
1225	VMOVDQA DD0, TT1
1226	VMOVDQA DD1, TT2
1227
1228	// Compute the number of iterations that will hash data
1229	MOVQ    inl, tmpStoreAVX2
1230	MOVQ    inl, itr1
1231	SUBQ    $128, itr1
1232	SHRQ    $4, itr1
1233	MOVQ    $10, itr2
1234	CMPQ    itr1, $10
1235	CMOVQGT itr2, itr1
1236	MOVQ    inp, inl
1237	XORQ    itr2, itr2
1238
1239openAVX2Tail256LoopA:
1240	polyAdd(0(inl))
1241	polyMulAVX2
1242	LEAQ 16(inl), inl
1243
1244	// Perform ChaCha rounds, while hashing the remaining input
1245openAVX2Tail256LoopB:
1246	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1247	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
1248	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
1249	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
1250	INCQ     itr2
1251	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1252	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
1253	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
1254	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
1255	CMPQ     itr2, itr1
1256	JB       openAVX2Tail256LoopA
1257
1258	CMPQ itr2, $10
1259	JNE  openAVX2Tail256LoopB
1260
1261	MOVQ inl, itr2
1262	SUBQ inp, inl
1263	MOVQ inl, itr1
1264	MOVQ tmpStoreAVX2, inl
1265
1266	// Hash the remainder of data (if any)
1267openAVX2Tail256Hash:
1268	ADDQ $16, itr1
1269	CMPQ itr1, inl
1270	JGT  openAVX2Tail256HashEnd
1271	polyAdd(0(itr2))
1272	polyMulAVX2
1273	LEAQ 16(itr2), itr2
1274	JMP  openAVX2Tail256Hash
1275
1276// Store 128 bytes safely, then go to store loop
1277openAVX2Tail256HashEnd:
1278	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
1279	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
1280	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
1281	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1
1282	VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2
1283	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
1284
1285	VPXOR   (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2
1286	VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup)
1287	LEAQ    (4*32)(inp), inp
1288	LEAQ    (4*32)(oup), oup
1289	SUBQ    $4*32, inl
1290
1291	JMP openAVX2TailLoop
1292
1293// ----------------------------------------------------------------------------
1294// Special optimization for the last 384 bytes of ciphertext
1295openAVX2Tail384:
1296	// Need to decrypt up to 384 bytes - prepare six blocks
1297	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
1298	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
1299	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
1300	VMOVDQA ctr3StoreAVX2, DD0
1301	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
1302	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
1303	VPADDD  ·avx2IncMask<>(SB), DD1, DD2
1304	VMOVDQA DD0, ctr0StoreAVX2
1305	VMOVDQA DD1, ctr1StoreAVX2
1306	VMOVDQA DD2, ctr2StoreAVX2
1307
1308	// Compute the number of iterations that will hash two blocks of data
1309	MOVQ    inl, tmpStoreAVX2
1310	MOVQ    inl, itr1
1311	SUBQ    $256, itr1
1312	SHRQ    $4, itr1
1313	ADDQ    $6, itr1
1314	MOVQ    $10, itr2
1315	CMPQ    itr1, $10
1316	CMOVQGT itr2, itr1
1317	MOVQ    inp, inl
1318	XORQ    itr2, itr2
1319
1320	// Perform ChaCha rounds, while hashing the remaining input
1321openAVX2Tail384LoopB:
1322	polyAdd(0(inl))
1323	polyMulAVX2
1324	LEAQ 16(inl), inl
1325
1326openAVX2Tail384LoopA:
1327	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
1328	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
1329	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
1330	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
1331	polyAdd(0(inl))
1332	polyMulAVX2
1333	LEAQ     16(inl), inl
1334	INCQ     itr2
1335	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
1336	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
1337	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
1338	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
1339
1340	CMPQ itr2, itr1
1341	JB   openAVX2Tail384LoopB
1342
1343	CMPQ itr2, $10
1344	JNE  openAVX2Tail384LoopA
1345
1346	MOVQ inl, itr2
1347	SUBQ inp, inl
1348	MOVQ inl, itr1
1349	MOVQ tmpStoreAVX2, inl
1350
1351openAVX2Tail384Hash:
1352	ADDQ $16, itr1
1353	CMPQ itr1, inl
1354	JGT  openAVX2Tail384HashEnd
1355	polyAdd(0(itr2))
1356	polyMulAVX2
1357	LEAQ 16(itr2), itr2
1358	JMP  openAVX2Tail384Hash
1359
1360// Store 256 bytes safely, then go to store loop
1361openAVX2Tail384HashEnd:
1362	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
1363	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
1364	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
1365	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2
1366	VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3
1367	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
1368	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
1369	VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3
1370	VPXOR      (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
1371	VMOVDQU    TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
1372	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
1373	LEAQ       (8*32)(inp), inp
1374	LEAQ       (8*32)(oup), oup
1375	SUBQ       $8*32, inl
1376	JMP        openAVX2TailLoop
1377
1378// ----------------------------------------------------------------------------
1379// Special optimization for the last 512 bytes of ciphertext
1380openAVX2Tail512:
1381	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
1382	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
1383	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
1384	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
1385	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
1386	XORQ    itr1, itr1
1387	MOVQ    inp, itr2
1388
1389openAVX2Tail512LoopB:
1390	polyAdd(0(itr2))
1391	polyMulAVX2
1392	LEAQ (2*8)(itr2), itr2
1393
1394openAVX2Tail512LoopA:
1395	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
1396	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
1397	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
1398	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
1399	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
1400	VMOVDQA  CC3, tmpStoreAVX2
1401	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
1402	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
1403	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
1404	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
1405	VMOVDQA  tmpStoreAVX2, CC3
1406	polyAdd(0*8(itr2))
1407	polyMulAVX2
1408	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
1409	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
1410	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
1411	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
1412	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
1413	VMOVDQA  CC3, tmpStoreAVX2
1414	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
1415	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
1416	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
1417	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
1418	VMOVDQA  tmpStoreAVX2, CC3
1419	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
1420	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
1421	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
1422	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
1423	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
1424	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
1425	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
1426	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
1427	polyAdd(2*8(itr2))
1428	polyMulAVX2
1429	LEAQ     (4*8)(itr2), itr2
1430	VMOVDQA  CC3, tmpStoreAVX2
1431	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
1432	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
1433	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
1434	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
1435	VMOVDQA  tmpStoreAVX2, CC3
1436	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
1437	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
1438	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
1439	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
1440	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
1441	VMOVDQA  CC3, tmpStoreAVX2
1442	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
1443	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
1444	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
1445	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
1446	VMOVDQA  tmpStoreAVX2, CC3
1447	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
1448	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
1449	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
1450	INCQ     itr1
1451	CMPQ     itr1, $4
1452	JLT      openAVX2Tail512LoopB
1453
1454	CMPQ itr1, $10
1455	JNE  openAVX2Tail512LoopA
1456
1457	MOVQ inl, itr1
1458	SUBQ $384, itr1
1459	ANDQ $-16, itr1
1460
1461openAVX2Tail512HashLoop:
1462	TESTQ itr1, itr1
1463	JE    openAVX2Tail512HashEnd
1464	polyAdd(0(itr2))
1465	polyMulAVX2
1466	LEAQ  16(itr2), itr2
1467	SUBQ  $16, itr1
1468	JMP   openAVX2Tail512HashLoop
1469
1470openAVX2Tail512HashEnd:
1471	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
1472	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
1473	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
1474	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
1475	VMOVDQA    CC3, tmpStoreAVX2
1476	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
1477	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
1478	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
1479	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
1480	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
1481	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
1482	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
1483	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
1484	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
1485	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
1486
1487	LEAQ (12*32)(inp), inp
1488	LEAQ (12*32)(oup), oup
1489	SUBQ $12*32, inl
1490
1491	JMP openAVX2TailLoop
1492
1493// ----------------------------------------------------------------------------
1494// ----------------------------------------------------------------------------
1495// func chacha20Poly1305Seal(dst, key, src, ad []byte)
1496TEXT ·chacha20Poly1305Seal(SB), 0, $288-96
1497	// For aligned stack access
1498	MOVQ SP, BP
1499	ADDQ $32, BP
1500	ANDQ $-32, BP
1501	MOVQ dst+0(FP), oup
1502	MOVQ key+24(FP), keyp
1503	MOVQ src+48(FP), inp
1504	MOVQ src_len+56(FP), inl
1505	MOVQ ad+72(FP), adp
1506
1507	CMPB ·useAVX2(SB), $1
1508	JE   chacha20Poly1305Seal_AVX2
1509
1510	// Special optimization for very short buffers
1511	CMPQ inl, $128
1512	JBE  sealSSE128 // About 15% faster
1513
1514	// In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
1515	MOVOU ·chacha20Constants<>(SB), A0
1516	MOVOU (1*16)(keyp), B0
1517	MOVOU (2*16)(keyp), C0
1518	MOVOU (3*16)(keyp), D0
1519
1520	// Store state on stack for future use
1521	MOVO B0, state1Store
1522	MOVO C0, state2Store
1523
1524	// Load state, increment counter blocks
1525	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
1526	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
1527	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
1528
1529	// Store counters
1530	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
1531	MOVQ $10, itr2
1532
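// Each iteration of the loop below performs one ChaCha20 double round (a
// column round followed by a diagonal round) on all four blocks at once; the
// shiftB/C/D macros rotate the state words between rounds. For reference, the
// scalar quarter round that each chachaQR invocation applies lane-wise looks
// roughly like this in Go (illustrative sketch only, not part of this file):
//
//	func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
//		a += b; d ^= a; d = bits.RotateLeft32(d, 16)
//		c += d; b ^= c; b = bits.RotateLeft32(b, 12)
//		a += b; d ^= a; d = bits.RotateLeft32(d, 8)
//		c += d; b ^= c; b = bits.RotateLeft32(b, 7)
//		return a, b, c, d
//	}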
1533sealSSEIntroLoop:
1534	MOVO         C3, tmpStore
1535	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
1536	MOVO         tmpStore, C3
1537	MOVO         C1, tmpStore
1538	chachaQR(A3, B3, C3, D3, C1)
1539	MOVO         tmpStore, C1
1540	shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
1541	shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
1542	shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
1543
1544	MOVO          C3, tmpStore
1545	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
1546	MOVO          tmpStore, C3
1547	MOVO          C1, tmpStore
1548	chachaQR(A3, B3, C3, D3, C1)
1549	MOVO          tmpStore, C1
1550	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
1551	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
1552	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
1553	DECQ          itr2
1554	JNE           sealSSEIntroLoop
1555
1556	// Add in the state
1557	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
1558	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
1559	PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
1560	PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
1561
1562	// Clamp and store the key
1563	PAND ·polyClampMask<>(SB), A0
1564	MOVO A0, rStore
1565	MOVO B0, sStore
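	// The first 32 bytes of the first keystream block form the one-time
	// Poly1305 key: the low 16 bytes are clamped with polyClampMask to give r
	// (kept in rStore) and the next 16 bytes are used unmodified as s (kept in
	// sStore); the rest of that first block is discarded and never used for
	// encryption.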
1566
1567	// Hash AAD
1568	MOVQ ad_len+80(FP), itr2
1569	CALL polyHashADInternal<>(SB)
1570
1571	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
1572	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
1573	MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
1574	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
1575	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
1576	MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup)
1577
1578	MOVQ $128, itr1
1579	SUBQ $128, inl
1580	LEAQ 128(inp), inp
1581
1582	MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1
1583
1584	CMPQ inl, $64
1585	JBE  sealSSE128SealHash
1586
1587	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
1588	PXOR  A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
1589	MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup)
1590
1591	ADDQ $64, itr1
1592	SUBQ $64, inl
1593	LEAQ 64(inp), inp
1594
1595	MOVQ $2, itr1
1596	MOVQ $8, itr2
1597
1598	CMPQ inl, $64
1599	JBE  sealSSETail64
1600	CMPQ inl, $128
1601	JBE  sealSSETail128
1602	CMPQ inl, $192
1603	JBE  sealSSETail192
1604
1605sealSSEMainLoop:
1606	// Load state, increment counter blocks
1607	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
1608	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
1609	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
1610	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
1611
1612	// Store counters
1613	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
1614
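// itr2 and itr1 drive an interleaved loop: each pass performs one ChaCha
// double round while hashing 16 bytes of earlier ciphertext, and once itr2 is
// exhausted an extra 16 bytes is hashed per remaining itr1 count. The pairs
// used (2/8 on the first pass, 6/4 afterwards) always come to 10 double
// rounds while hashing 192 or 256 bytes of previously produced ciphertext.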
1615sealSSEInnerLoop:
1616	MOVO          C3, tmpStore
1617	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
1618	MOVO          tmpStore, C3
1619	MOVO          C1, tmpStore
1620	chachaQR(A3, B3, C3, D3, C1)
1621	MOVO          tmpStore, C1
1622	polyAdd(0(oup))
1623	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
1624	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
1625	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
1626	polyMulStage1
1627	polyMulStage2
1628	LEAQ          (2*8)(oup), oup
1629	MOVO          C3, tmpStore
1630	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
1631	MOVO          tmpStore, C3
1632	MOVO          C1, tmpStore
1633	polyMulStage3
1634	chachaQR(A3, B3, C3, D3, C1)
1635	MOVO          tmpStore, C1
1636	polyMulReduceStage
1637	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
1638	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
1639	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
1640	DECQ          itr2
1641	JGE           sealSSEInnerLoop
1642	polyAdd(0(oup))
1643	polyMul
1644	LEAQ          (2*8)(oup), oup
1645	DECQ          itr1
1646	JG            sealSSEInnerLoop
1647
1648	// Add in the state
1649	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
1650	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
1651	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
1652	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
1653	MOVO  D3, tmpStore
1654
1655	// Load - xor - store
1656	MOVOU (0*16)(inp), D3; PXOR D3, A0
1657	MOVOU (1*16)(inp), D3; PXOR D3, B0
1658	MOVOU (2*16)(inp), D3; PXOR D3, C0
1659	MOVOU (3*16)(inp), D3; PXOR D3, D0
1660	MOVOU A0, (0*16)(oup)
1661	MOVOU B0, (1*16)(oup)
1662	MOVOU C0, (2*16)(oup)
1663	MOVOU D0, (3*16)(oup)
1664	MOVO  tmpStore, D3
1665
1666	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
1667	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
1668	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
1669	MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
1670	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
1671	MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
1672	ADDQ  $192, inp
1673	MOVQ  $192, itr1
1674	SUBQ  $192, inl
1675	MOVO  A3, A1
1676	MOVO  B3, B1
1677	MOVO  C3, C1
1678	MOVO  D3, D1
1679	CMPQ  inl, $64
1680	JBE   sealSSE128SealHash
1681	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
1682	PXOR  A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
1683	MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup)
1684	LEAQ  64(inp), inp
1685	SUBQ  $64, inl
1686	MOVQ  $6, itr1
1687	MOVQ  $4, itr2
1688	CMPQ  inl, $192
1689	JG    sealSSEMainLoop
1690
1691	MOVQ  inl, itr1
1692	TESTQ inl, inl
1693	JE    sealSSE128SealHash
1694	MOVQ  $6, itr1
1695	CMPQ  inl, $64
1696	JBE   sealSSETail64
1697	CMPQ  inl, $128
1698	JBE   sealSSETail128
1699	JMP   sealSSETail192
1700
1701// ----------------------------------------------------------------------------
1702// Special optimization for the last 64 bytes of plaintext
1703sealSSETail64:
1704	// Need to encrypt up to 64 bytes - prepare a single block, hash 192 or 256 bytes
1705	MOVO  ·chacha20Constants<>(SB), A1
1706	MOVO  state1Store, B1
1707	MOVO  state2Store, C1
1708	MOVO  ctr3Store, D1
1709	PADDL ·sseIncMask<>(SB), D1
1710	MOVO  D1, ctr0Store
1711
1712sealSSETail64LoopA:
1713	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
1714	polyAdd(0(oup))
1715	polyMul
1716	LEAQ 16(oup), oup
1717
1718sealSSETail64LoopB:
1719	chachaQR(A1, B1, C1, D1, T1)
1720	shiftB1Left;  shiftC1Left; shiftD1Left
1721	chachaQR(A1, B1, C1, D1, T1)
1722	shiftB1Right; shiftC1Right; shiftD1Right
1723	polyAdd(0(oup))
1724	polyMul
1725	LEAQ          16(oup), oup
1726
1727	DECQ itr1
1728	JG   sealSSETail64LoopA
1729
1730	DECQ  itr2
1731	JGE   sealSSETail64LoopB
1732	PADDL ·chacha20Constants<>(SB), A1
1733	PADDL state1Store, B1
1734	PADDL state2Store, C1
1735	PADDL ctr0Store, D1
1736
1737	JMP sealSSE128Seal
1738
1739// ----------------------------------------------------------------------------
1740// Special optimization for the last 128 bytes of plaintext
1741sealSSETail128:
1742	// Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
1743	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
1744	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
1745
1746sealSSETail128LoopA:
1747	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
1748	polyAdd(0(oup))
1749	polyMul
1750	LEAQ 16(oup), oup
1751
1752sealSSETail128LoopB:
1753	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
1754	shiftB0Left;  shiftC0Left; shiftD0Left
1755	shiftB1Left;  shiftC1Left; shiftD1Left
1756	polyAdd(0(oup))
1757	polyMul
1758	LEAQ          16(oup), oup
1759	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
1760	shiftB0Right; shiftC0Right; shiftD0Right
1761	shiftB1Right; shiftC1Right; shiftD1Right
1762
1763	DECQ itr1
1764	JG   sealSSETail128LoopA
1765
1766	DECQ itr2
1767	JGE  sealSSETail128LoopB
1768
1769	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
1770	PADDL state1Store, B0; PADDL state1Store, B1
1771	PADDL state2Store, C0; PADDL state2Store, C1
1772	PADDL ctr0Store, D0; PADDL ctr1Store, D1
1773
1774	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
1775	PXOR  T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
1776	MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
1777
1778	MOVQ $64, itr1
1779	LEAQ 64(inp), inp
1780	SUBQ $64, inl
1781
1782	JMP sealSSE128SealHash
1783
1784// ----------------------------------------------------------------------------
1785// Special optimization for the last 192 bytes of plaintext
1786sealSSETail192:
1787	// Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
1788	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
1789	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
1790	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store
1791
1792sealSSETail192LoopA:
1793	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
1794	polyAdd(0(oup))
1795	polyMul
1796	LEAQ 16(oup), oup
1797
1798sealSSETail192LoopB:
1799	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
1800	shiftB0Left; shiftC0Left; shiftD0Left
1801	shiftB1Left; shiftC1Left; shiftD1Left
1802	shiftB2Left; shiftC2Left; shiftD2Left
1803
1804	polyAdd(0(oup))
1805	polyMul
1806	LEAQ 16(oup), oup
1807
1808	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
1809	shiftB0Right; shiftC0Right; shiftD0Right
1810	shiftB1Right; shiftC1Right; shiftD1Right
1811	shiftB2Right; shiftC2Right; shiftD2Right
1812
1813	DECQ itr1
1814	JG   sealSSETail192LoopA
1815
1816	DECQ itr2
1817	JGE  sealSSETail192LoopB
1818
1819	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
1820	PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
1821	PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
1822	PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2
1823
1824	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
1825	PXOR  T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
1826	MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
1827	MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
1828	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
1829	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
1830
1831	MOVO A2, A1
1832	MOVO B2, B1
1833	MOVO C2, C1
1834	MOVO D2, D1
1835	MOVQ $128, itr1
1836	LEAQ 128(inp), inp
1837	SUBQ $128, inl
1838
1839	JMP sealSSE128SealHash
1840
1841// ----------------------------------------------------------------------------
1842// Special seal optimization for buffers smaller than 129 bytes
1843sealSSE128:
1844	// For up to 128 bytes of plaintext and 64 bytes for the poly key, we need to process three blocks
1845	MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
1846	MOVO  A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
1847	MOVO  A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
1848	MOVO  B0, T1; MOVO C0, T2; MOVO D1, T3
1849	MOVQ  $10, itr2
1850
1851sealSSE128InnerCipherLoop:
1852	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
1853	shiftB0Left;  shiftB1Left; shiftB2Left
1854	shiftC0Left;  shiftC1Left; shiftC2Left
1855	shiftD0Left;  shiftD1Left; shiftD2Left
1856	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
1857	shiftB0Right; shiftB1Right; shiftB2Right
1858	shiftC0Right; shiftC1Right; shiftC2Right
1859	shiftD0Right; shiftD1Right; shiftD2Right
1860	DECQ          itr2
1861	JNE           sealSSE128InnerCipherLoop
1862
1863	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
1864	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
1865	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
1866	PADDL T2, C1; PADDL T2, C2
1867	PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
1868	PAND  ·polyClampMask<>(SB), A0
1869	MOVOU A0, rStore
1870	MOVOU B0, sStore
1871
1872	// Hash
1873	MOVQ ad_len+80(FP), itr2
1874	CALL polyHashADInternal<>(SB)
1875	XORQ itr1, itr1
1876
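// Since sealing authenticates the ciphertext, hashing always lags encryption:
// itr1 records how many ciphertext bytes have been written but not yet
// absorbed into Poly1305, and the loop below drains them 16 bytes at a time
// before the remaining plaintext is processed.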
1877sealSSE128SealHash:
1878	// itr1 holds the number of bytes encrypted but not yet hashed
1879	CMPQ itr1, $16
1880	JB   sealSSE128Seal
1881	polyAdd(0(oup))
1882	polyMul
1883
1884	SUBQ $16, itr1
1885	ADDQ $16, oup
1886
1887	JMP sealSSE128SealHash
1888
1889sealSSE128Seal:
1890	CMPQ inl, $16
1891	JB   sealSSETail
1892	SUBQ $16, inl
1893
1894	// Load for encryption
1895	MOVOU (inp), T0
1896	PXOR  T0, A1
1897	MOVOU A1, (oup)
1898	LEAQ  (1*16)(inp), inp
1899	LEAQ  (1*16)(oup), oup
1900
1901	// Extract for hashing
1902	MOVQ   A1, t0
1903	PSRLDQ $8, A1
1904	MOVQ A1, t1
1905	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
1906	polyMul
1907
1908	// Shift the stream "left"
1909	MOVO B1, A1
1910	MOVO C1, B1
1911	MOVO D1, C1
1912	MOVO A2, D1
1913	MOVO B2, A2
1914	MOVO C2, B2
1915	MOVO D2, C2
1916	JMP  sealSSE128Seal
1917
1918sealSSETail:
1919	TESTQ inl, inl
1920	JE    sealSSEFinalize
1921
1922	// We can only load the plaintext one byte at a time, to avoid reading past the end of the buffer
1923	MOVQ inl, itr2
1924	SHLQ $4, itr2
1925	LEAQ ·andMask<>(SB), t0
1926	MOVQ inl, itr1
1927	LEAQ -1(inp)(inl*1), inp
1928	XORQ t2, t2
1929	XORQ t3, t3
1930	XORQ AX, AX
1931
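// The loop below reads the last inl bytes backwards, one byte at a time,
// shifting them into t3:t2 so the first message byte ends up in the low byte
// of t2; the result is a zero-padded little-endian 16-byte block that can be
// XORed with the keystream without ever reading past the input. The following
// 16-byte store is safe because the caller always leaves room for the 16-byte
// tag after the ciphertext, and the PAND with the andMask entry trims the
// ciphertext block back to inl bytes before it is absorbed into Poly1305,
// since the AEAD construction zero-pads the ciphertext to a 16-byte boundary.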
1932sealSSETailLoadLoop:
1933	SHLQ $8, t2, t3
1934	SHLQ $8, t2
1935	MOVB (inp), AX
1936	XORQ AX, t2
1937	LEAQ   -1(inp), inp
1938	DECQ   itr1
1939	JNE    sealSSETailLoadLoop
1940	MOVQ t2, 0+tmpStore
1941	MOVQ t3, 8+tmpStore
1942	PXOR 0+tmpStore, A1
1943	MOVOU  A1, (oup)
1944	MOVOU  -16(t0)(itr2*1), T0
1945	PAND   T0, A1
1946	MOVQ   A1, t0
1947	PSRLDQ $8, A1
1948	MOVQ   A1, t1
1949	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
1950	polyMul
1951
1952	ADDQ inl, oup
1953
1954sealSSEFinalize:
1955	// Hash in the buffer lengths (the final Poly1305 block: len(additional data) and len(ciphertext), each as a little-endian 64-bit word)
1956	ADDQ ad_len+80(FP), acc0
1957	ADCQ src_len+56(FP), acc1
1958	ADCQ $1, acc2
1959	polyMul
1960
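	// The accumulator (acc2:acc1:acc0) is only partially reduced, so the tag
	// is finished by conditionally subtracting the prime 2^130-5 and then
	// adding the "s" half of the one-time key modulo 2^128. The three-limb
	// constant subtracted below (-5, -1, 3) is exactly 2^130-5, and CMOVQCS
	// restores the original value whenever the subtraction borrows. A minimal
	// Go sketch of the same step (illustrative only; h0..h2 is the
	// accumulator, s0/s1 the second half of the key):
	//
	//	t0, b := bits.Sub64(h0, 0xFFFFFFFFFFFFFFFB, 0)
	//	t1, b := bits.Sub64(h1, 0xFFFFFFFFFFFFFFFF, b)
	//	_, b = bits.Sub64(h2, 3, b)
	//	if b == 0 {
	//		h0, h1 = t0, t1 // h >= 2^130-5, keep the reduced value
	//	}
	//	tag0, c := bits.Add64(h0, s0, 0)
	//	tag1, _ := bits.Add64(h1, s1, c)
	//	// the 16-byte tag is tag0 || tag1, little-endian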
1961	// Final reduce
1962	MOVQ    acc0, t0
1963	MOVQ    acc1, t1
1964	MOVQ    acc2, t2
1965	SUBQ    $-5, acc0
1966	SBBQ    $-1, acc1
1967	SBBQ    $3, acc2
1968	CMOVQCS t0, acc0
1969	CMOVQCS t1, acc1
1970	CMOVQCS t2, acc2
1971
1972	// Add in the "s" part of the key
1973	ADDQ 0+sStore, acc0
1974	ADCQ 8+sStore, acc1
1975
1976	// Finally store the tag at the end of the message
1977	MOVQ acc0, (0*8)(oup)
1978	MOVQ acc1, (1*8)(oup)
1979	RET
1980
1981// ----------------------------------------------------------------------------
1982// ------------------------- AVX2 Code ----------------------------------------
1983chacha20Poly1305Seal_AVX2:
1984	VZEROUPPER
1985	VMOVDQU ·chacha20Constants<>(SB), AA0
1986	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
1987	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
1988	BYTE    $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
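	// The three hand-encoded instructions above are VBROADCASTI128 loads that
	// copy the two 16-byte key halves and the counter/nonce block from the key
	// pointer into both 128-bit lanes of BB0, CC0 and DD0; they are emitted as
	// raw bytes, presumably because the assembler did not accept this form of
	// the instruction when the code was written.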
1989	VPADDD  ·avx2InitMask<>(SB), DD0, DD0
1990
1991	// Special optimizations for very short buffers
1992	CMPQ inl, $192
1993	JBE  seal192AVX2 // 33% faster
1994	CMPQ inl, $320
1995	JBE  seal320AVX2 // 17% faster
1996
1997	// For the general case, prepare the poly key first - as a byproduct we get 64 bytes of cipher stream
1998	VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
1999	VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
2000	VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
2001	VPADDD  ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
2002	VPADDD  ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
2003	VPADDD  ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
2004	VMOVDQA DD3, ctr3StoreAVX2
2005	MOVQ    $10, itr2
2006
2007sealAVX2IntroLoop:
2008	VMOVDQA CC3, tmpStoreAVX2
2009	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
2010	VMOVDQA tmpStoreAVX2, CC3
2011	VMOVDQA CC1, tmpStoreAVX2
2012	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
2013	VMOVDQA tmpStoreAVX2, CC1
2014
2015	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
2016	VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
2017	VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
2018	VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
2019
2020	VMOVDQA CC3, tmpStoreAVX2
2021	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
2022	VMOVDQA tmpStoreAVX2, CC3
2023	VMOVDQA CC1, tmpStoreAVX2
2024	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
2025	VMOVDQA tmpStoreAVX2, CC1
2026
2027	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
2028	VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
2029	VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
2030	VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
2031	DECQ     itr2
2032	JNE      sealAVX2IntroLoop
2033
2034	VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
2035	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
2036	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
2037	VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
2038
2039	VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127
2040	VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key
2041	VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95
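	// With two counter blocks per register group, VPERM2I128 with immediate
	// 0x02 gathers the low 128-bit lanes of its sources (the first block) and
	// 0x13 gathers the high lanes (the second block), so the three shuffles
	// above split the first register group into the Poly1305 key block and the
	// remaining 64 bytes of usable keystream.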
2042
2043	// Clamp and store poly key
2044	VPAND   ·polyClampMask<>(SB), DD0, DD0
2045	VMOVDQA DD0, rsStoreAVX2
2046
2047	// Hash AD
2048	MOVQ ad_len+80(FP), itr2
2049	CALL polyHashADInternal<>(SB)
2050
2051	// Can store at least 320 bytes
2052	VPXOR   (0*32)(inp), AA0, AA0
2053	VPXOR   (1*32)(inp), CC0, CC0
2054	VMOVDQU AA0, (0*32)(oup)
2055	VMOVDQU CC0, (1*32)(oup)
2056
2057	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
2058	VPXOR      (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0
2059	VMOVDQU    AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup)
2060	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
2061	VPXOR      (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0
2062	VMOVDQU    AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup)
2063
2064	MOVQ $320, itr1
2065	SUBQ $320, inl
2066	LEAQ 320(inp), inp
2067
2068	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0
2069	CMPQ       inl, $128
2070	JBE        sealAVX2SealHash
2071
2072	VPXOR   (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0
2073	VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup)
2074	SUBQ    $128, inl
2075	LEAQ    128(inp), inp
2076
2077	MOVQ $8, itr1
2078	MOVQ $2, itr2
2079
2080	CMPQ inl, $128
2081	JBE  sealAVX2Tail128
2082	CMPQ inl, $256
2083	JBE  sealAVX2Tail256
2084	CMPQ inl, $384
2085	JBE  sealAVX2Tail384
2086	CMPQ inl, $512
2087	JBE  sealAVX2Tail512
2088
2089	// We have 448 bytes to hash, but the main loop hashes 512 bytes at a time - perform some rounds before entering the main loop
2090	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
2091	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
2092	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
2093	VMOVDQA ctr3StoreAVX2, DD0
2094	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
2095	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
2096
2097	VMOVDQA CC3, tmpStoreAVX2
2098	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
2099	VMOVDQA tmpStoreAVX2, CC3
2100	VMOVDQA CC1, tmpStoreAVX2
2101	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
2102	VMOVDQA tmpStoreAVX2, CC1
2103
2104	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
2105	VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
2106	VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
2107	VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
2108
2109	VMOVDQA CC3, tmpStoreAVX2
2110	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
2111	VMOVDQA tmpStoreAVX2, CC3
2112	VMOVDQA CC1, tmpStoreAVX2
2113	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
2114	VMOVDQA tmpStoreAVX2, CC1
2115
2116	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
2117	VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
2118	VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
2119	VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
2120	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2121	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2122	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2123	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2124	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2125	VMOVDQA  CC3, tmpStoreAVX2
2126	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2127	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2128	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2129	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2130	VMOVDQA  tmpStoreAVX2, CC3
2131
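	// The rounds above have already run the first ChaCha double round and part
	// of the second without hashing anything (only 448 bytes of ciphertext are
	// buffered at this point), so we jump into the middle of the main loop
	// body with 9 iterations left; oup is stepped back 16 bytes so that the
	// polyAdd offsets inside the loop line up with the start of the unhashed
	// ciphertext.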
2132	SUBQ $16, oup                  // Adjust the pointer
2133	MOVQ $9, itr1
2134	JMP  sealAVX2InternalLoopStart
2135
2136sealAVX2MainLoop:
2137	// Load state, increment counter blocks, store the incremented counters
2138	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
2139	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
2140	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
2141	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
2142	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
2143	MOVQ    $10, itr1
2144
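// Each iteration of the main loop generates 512 bytes of keystream (four
// register groups of two blocks each) while hashing the previous iteration's
// 512 bytes of ciphertext; the polyAdd/polyMulStage macros are spread between
// the vector instructions so the scalar Poly1305 work overlaps the SIMD
// ChaCha rounds. Ten double rounds hash 48 bytes each (480 in total), and the
// remaining 32 bytes are hashed right after the loop.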
2145sealAVX2InternalLoop:
2146	polyAdd(0*8(oup))
2147	VPADDD  BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2148	polyMulStage1_AVX2
2149	VPXOR   AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2150	VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2151	polyMulStage2_AVX2
2152	VPADDD  DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2153	VPXOR   CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2154	polyMulStage3_AVX2
2155	VMOVDQA CC3, tmpStoreAVX2
2156	VPSLLD  $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2157	VPSLLD  $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2158	VPSLLD  $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2159	VPSLLD  $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2160	VMOVDQA tmpStoreAVX2, CC3
2161	polyMulReduceStage
2162
2163sealAVX2InternalLoopStart:
2164	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2165	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2166	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
2167	polyAdd(2*8(oup))
2168	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2169	polyMulStage1_AVX2
2170	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2171	VMOVDQA  CC3, tmpStoreAVX2
2172	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
2173	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
2174	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
2175	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
2176	VMOVDQA  tmpStoreAVX2, CC3
2177	polyMulStage2_AVX2
2178	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
2179	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
2180	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
2181	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2182	polyMulStage3_AVX2
2183	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2184	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2185	polyMulReduceStage
2186	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2187	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2188	polyAdd(4*8(oup))
2189	LEAQ     (6*8)(oup), oup
2190	VMOVDQA  CC3, tmpStoreAVX2
2191	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2192	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2193	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2194	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2195	VMOVDQA  tmpStoreAVX2, CC3
2196	polyMulStage1_AVX2
2197	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2198	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2199	polyMulStage2_AVX2
2200	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
2201	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2202	polyMulStage3_AVX2
2203	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2204	VMOVDQA  CC3, tmpStoreAVX2
2205	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
2206	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
2207	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
2208	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
2209	VMOVDQA  tmpStoreAVX2, CC3
2210	polyMulReduceStage
2211	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
2212	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
2213	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
2214	DECQ     itr1
2215	JNE      sealAVX2InternalLoop
2216
2217	VPADDD  ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
2218	VPADDD  state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
2219	VPADDD  state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
2220	VPADDD  ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
2221	VMOVDQA CC3, tmpStoreAVX2
2222
2223	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
2224	polyAdd(0*8(oup))
2225	polyMulAVX2
2226	LEAQ       (4*8)(oup), oup
2227	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
2228	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
2229	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
2230	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
2231	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
2232	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
2233
2234	// and here
2235	polyAdd(-2*8(oup))
2236	polyMulAVX2
2237	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
2238	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
2239	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
2240	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
2241	VPXOR      (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
2242	VMOVDQU    AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
2243	LEAQ       (32*16)(inp), inp
2244	SUBQ       $(32*16), inl
2245	CMPQ       inl, $512
2246	JG         sealAVX2MainLoop
2247
2248	// The tail routines can hash at most 480 bytes, so hash the remaining 32 of the outstanding 512 here
2249	polyAdd(0*8(oup))
2250	polyMulAVX2
2251	polyAdd(2*8(oup))
2252	polyMulAVX2
2253	LEAQ 32(oup), oup
2254
2255	MOVQ $10, itr1
2256	MOVQ $0, itr2
2257	CMPQ inl, $128
2258	JBE  sealAVX2Tail128
2259	CMPQ inl, $256
2260	JBE  sealAVX2Tail256
2261	CMPQ inl, $384
2262	JBE  sealAVX2Tail384
2263	JMP  sealAVX2Tail512
2264
2265// ----------------------------------------------------------------------------
2266// Special optimization for buffers smaller than 193 bytes
2267seal192AVX2:
2268	// For up to 192 bytes of plaintext and 64 bytes for the poly key, we process four blocks
2269	VMOVDQA AA0, AA1
2270	VMOVDQA BB0, BB1
2271	VMOVDQA CC0, CC1
2272	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
2273	VMOVDQA AA0, AA2
2274	VMOVDQA BB0, BB2
2275	VMOVDQA CC0, CC2
2276	VMOVDQA DD0, DD2
2277	VMOVDQA DD1, TT3
2278	MOVQ    $10, itr2
2279
2280sealAVX2192InnerCipherLoop:
2281	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
2282	VPALIGNR   $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
2283	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
2284	VPALIGNR   $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
2285	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
2286	VPALIGNR   $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
2287	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
2288	VPALIGNR   $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
2289	DECQ       itr2
2290	JNE        sealAVX2192InnerCipherLoop
2291	VPADDD     AA2, AA0, AA0; VPADDD AA2, AA1, AA1
2292	VPADDD     BB2, BB0, BB0; VPADDD BB2, BB1, BB1
2293	VPADDD     CC2, CC0, CC0; VPADDD CC2, CC1, CC1
2294	VPADDD     DD2, DD0, DD0; VPADDD TT3, DD1, DD1
2295	VPERM2I128 $0x02, AA0, BB0, TT0
2296
2297	// Clamp and store poly key
2298	VPAND   ·polyClampMask<>(SB), TT0, TT0
2299	VMOVDQA TT0, rsStoreAVX2
2300
2301	// Stream for up to 192 bytes
2302	VPERM2I128 $0x13, AA0, BB0, AA0
2303	VPERM2I128 $0x13, CC0, DD0, BB0
2304	VPERM2I128 $0x02, AA1, BB1, CC0
2305	VPERM2I128 $0x02, CC1, DD1, DD0
2306	VPERM2I128 $0x13, AA1, BB1, AA1
2307	VPERM2I128 $0x13, CC1, DD1, BB1
2308
2309sealAVX2ShortSeal:
2310	// Hash aad
2311	MOVQ ad_len+80(FP), itr2
2312	CALL polyHashADInternal<>(SB)
2313	XORQ itr1, itr1
2314
2315sealAVX2SealHash:
2316	// itr1 holds the number of bytes encrypted but not yet hashed
2317	CMPQ itr1, $16
2318	JB   sealAVX2ShortSealLoop
2319	polyAdd(0(oup))
2320	polyMul
2321	SUBQ $16, itr1
2322	ADDQ $16, oup
2323	JMP  sealAVX2SealHash
2324
2325sealAVX2ShortSealLoop:
2326	CMPQ inl, $32
2327	JB   sealAVX2ShortTail32
2328	SUBQ $32, inl
2329
2330	// Load for encryption
2331	VPXOR   (inp), AA0, AA0
2332	VMOVDQU AA0, (oup)
2333	LEAQ    (1*32)(inp), inp
2334
2335	// Now we can hash
2336	polyAdd(0*8(oup))
2337	polyMulAVX2
2338	polyAdd(2*8(oup))
2339	polyMulAVX2
2340	LEAQ (1*32)(oup), oup
2341
2342	// Shift stream left
2343	VMOVDQA BB0, AA0
2344	VMOVDQA CC0, BB0
2345	VMOVDQA DD0, CC0
2346	VMOVDQA AA1, DD0
2347	VMOVDQA BB1, AA1
2348	VMOVDQA CC1, BB1
2349	VMOVDQA DD1, CC1
2350	VMOVDQA AA2, DD1
2351	VMOVDQA BB2, AA2
2352	JMP     sealAVX2ShortSealLoop
2353
2354sealAVX2ShortTail32:
2355	CMPQ    inl, $16
2356	VMOVDQA A0, A1
2357	JB      sealAVX2ShortDone
2358
2359	SUBQ $16, inl
2360
2361	// Load for encryption
2362	VPXOR   (inp), A0, T0
2363	VMOVDQU T0, (oup)
2364	LEAQ    (1*16)(inp), inp
2365
2366	// Hash
2367	polyAdd(0*8(oup))
2368	polyMulAVX2
2369	LEAQ       (1*16)(oup), oup
2370	VPERM2I128 $0x11, AA0, AA0, AA0
2371	VMOVDQA    A0, A1
2372
2373sealAVX2ShortDone:
2374	VZEROUPPER
2375	JMP sealSSETail
2376
2377// ----------------------------------------------------------------------------
2378// Special optimization for buffers smaller than 321 bytes
2379seal320AVX2:
2380	// For up to 320 bytes of plaintext and 64 bytes for the poly key, we process six blocks
2381	VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
2382	VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
2383	VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
2384	MOVQ    $10, itr2
2385
2386sealAVX2320InnerCipherLoop:
2387	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
2388	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
2389	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
2390	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
2391	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
2392	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
2393	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
2394	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
2395	DECQ     itr2
2396	JNE      sealAVX2320InnerCipherLoop
2397
2398	VMOVDQA ·chacha20Constants<>(SB), TT0
2399	VPADDD  TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
2400	VPADDD  TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
2401	VPADDD  TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
2402	VMOVDQA ·avx2IncMask<>(SB), TT0
2403	VPADDD  TT3, DD0, DD0; VPADDD TT0, TT3, TT3
2404	VPADDD  TT3, DD1, DD1; VPADDD TT0, TT3, TT3
2405	VPADDD  TT3, DD2, DD2
2406
2407	// Clamp and store poly key
2408	VPERM2I128 $0x02, AA0, BB0, TT0
2409	VPAND      ·polyClampMask<>(SB), TT0, TT0
2410	VMOVDQA    TT0, rsStoreAVX2
2411
2412	// Stream for up to 320 bytes
2413	VPERM2I128 $0x13, AA0, BB0, AA0
2414	VPERM2I128 $0x13, CC0, DD0, BB0
2415	VPERM2I128 $0x02, AA1, BB1, CC0
2416	VPERM2I128 $0x02, CC1, DD1, DD0
2417	VPERM2I128 $0x13, AA1, BB1, AA1
2418	VPERM2I128 $0x13, CC1, DD1, BB1
2419	VPERM2I128 $0x02, AA2, BB2, CC1
2420	VPERM2I128 $0x02, CC2, DD2, DD1
2421	VPERM2I128 $0x13, AA2, BB2, AA2
2422	VPERM2I128 $0x13, CC2, DD2, BB2
2423	JMP        sealAVX2ShortSeal
2424
2425// ----------------------------------------------------------------------------
2426	// Special optimization for the last 128 bytes of plaintext
2427sealAVX2Tail128:
2428	// Need to encrypt up to 128 bytes - prepare two blocks
2429	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
2430	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
2431	VMOVDQA ·chacha20Constants<>(SB), AA0
2432	VMOVDQA state1StoreAVX2, BB0
2433	VMOVDQA state2StoreAVX2, CC0
2434	VMOVDQA ctr3StoreAVX2, DD0
2435	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
2436	VMOVDQA DD0, DD1
2437
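// itr1 and itr2 were set before branching here (8 and 2 when coming from the
// pre-main-loop path with 448 unhashed bytes, 10 and 0 after the main loop):
// LoopA absorbs an extra 16 bytes per itr1 count, while LoopB always ends up
// running the full 10 ChaCha double rounds hashing 32 bytes each, so the
// outstanding ciphertext is exactly consumed by the time the last keystream
// blocks are ready.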
2438sealAVX2Tail128LoopA:
2439	polyAdd(0(oup))
2440	polyMul
2441	LEAQ 16(oup), oup
2442
2443sealAVX2Tail128LoopB:
2444	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
2445	polyAdd(0(oup))
2446	polyMul
2447	VPALIGNR $4, BB0, BB0, BB0
2448	VPALIGNR $8, CC0, CC0, CC0
2449	VPALIGNR $12, DD0, DD0, DD0
2450	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
2451	polyAdd(16(oup))
2452	polyMul
2453	LEAQ     32(oup), oup
2454	VPALIGNR $12, BB0, BB0, BB0
2455	VPALIGNR $8, CC0, CC0, CC0
2456	VPALIGNR $4, DD0, DD0, DD0
2457	DECQ     itr1
2458	JG       sealAVX2Tail128LoopA
2459	DECQ     itr2
2460	JGE      sealAVX2Tail128LoopB
2461
2462	VPADDD ·chacha20Constants<>(SB), AA0, AA1
2463	VPADDD state1StoreAVX2, BB0, BB1
2464	VPADDD state2StoreAVX2, CC0, CC1
2465	VPADDD DD1, DD0, DD1
2466
2467	VPERM2I128 $0x02, AA1, BB1, AA0
2468	VPERM2I128 $0x02, CC1, DD1, BB0
2469	VPERM2I128 $0x13, AA1, BB1, CC0
2470	VPERM2I128 $0x13, CC1, DD1, DD0
2471	JMP        sealAVX2ShortSealLoop
2472
2473// ----------------------------------------------------------------------------
2474	// Special optimization for the last 256 bytes of plaintext
2475sealAVX2Tail256:
2476	// Need to encrypt up to 256 bytes - prepare four blocks
2477	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
2478	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
2479	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
2480	VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
2481	VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
2482	VMOVDQA ctr3StoreAVX2, DD0
2483	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
2484	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
2485	VMOVDQA DD0, TT1
2486	VMOVDQA DD1, TT2
2487
2488sealAVX2Tail256LoopA:
2489	polyAdd(0(oup))
2490	polyMul
2491	LEAQ 16(oup), oup
2492
2493sealAVX2Tail256LoopB:
2494	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
2495	polyAdd(0(oup))
2496	polyMul
2497	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
2498	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
2499	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
2500	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
2501	polyAdd(16(oup))
2502	polyMul
2503	LEAQ     32(oup), oup
2504	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
2505	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
2506	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
2507	DECQ     itr1
2508	JG       sealAVX2Tail256LoopA
2509	DECQ     itr2
2510	JGE      sealAVX2Tail256LoopB
2511
2512	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
2513	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
2514	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
2515	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1
2516	VPERM2I128 $0x02, AA0, BB0, TT0
2517	VPERM2I128 $0x02, CC0, DD0, TT1
2518	VPERM2I128 $0x13, AA0, BB0, TT2
2519	VPERM2I128 $0x13, CC0, DD0, TT3
2520	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
2521	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
2522	MOVQ       $128, itr1
2523	LEAQ       128(inp), inp
2524	SUBQ       $128, inl
2525	VPERM2I128 $0x02, AA1, BB1, AA0
2526	VPERM2I128 $0x02, CC1, DD1, BB0
2527	VPERM2I128 $0x13, AA1, BB1, CC0
2528	VPERM2I128 $0x13, CC1, DD1, DD0
2529
2530	JMP sealAVX2SealHash
2531
2532// ----------------------------------------------------------------------------
2533	// Special optimization for the last 384 bytes of plaintext
2534sealAVX2Tail384:
2535	// Need to encrypt up to 384 bytes - prepare six blocks
2536	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
2537	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
2538	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
2539	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
2540	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
2541	VMOVDQA ctr3StoreAVX2, DD0
2542	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
2543	VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3
2544
2545sealAVX2Tail384LoopA:
2546	polyAdd(0(oup))
2547	polyMul
2548	LEAQ 16(oup), oup
2549
2550sealAVX2Tail384LoopB:
2551	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
2552	polyAdd(0(oup))
2553	polyMul
2554	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
2555	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
2556	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
2557	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
2558	polyAdd(16(oup))
2559	polyMul
2560	LEAQ     32(oup), oup
2561	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
2562	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
2563	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
2564	DECQ     itr1
2565	JG       sealAVX2Tail384LoopA
2566	DECQ     itr2
2567	JGE      sealAVX2Tail384LoopB
2568
2569	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
2570	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
2571	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
2572	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
2573	VPERM2I128 $0x02, AA0, BB0, TT0
2574	VPERM2I128 $0x02, CC0, DD0, TT1
2575	VPERM2I128 $0x13, AA0, BB0, TT2
2576	VPERM2I128 $0x13, CC0, DD0, TT3
2577	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
2578	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
2579	VPERM2I128 $0x02, AA1, BB1, TT0
2580	VPERM2I128 $0x02, CC1, DD1, TT1
2581	VPERM2I128 $0x13, AA1, BB1, TT2
2582	VPERM2I128 $0x13, CC1, DD1, TT3
2583	VPXOR      (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
2584	VMOVDQU    TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
2585	MOVQ       $256, itr1
2586	LEAQ       256(inp), inp
2587	SUBQ       $256, inl
2588	VPERM2I128 $0x02, AA2, BB2, AA0
2589	VPERM2I128 $0x02, CC2, DD2, BB0
2590	VPERM2I128 $0x13, AA2, BB2, CC0
2591	VPERM2I128 $0x13, CC2, DD2, DD0
2592
2593	JMP sealAVX2SealHash
2594
2595// ----------------------------------------------------------------------------
2596	// Special optimization for the last 512 bytes of plaintext
2597sealAVX2Tail512:
2598	// Need to encrypt up to 512 bytes - prepare eight blocks
2599	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
2600	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
2601	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
2602	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
2603	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
2604	VMOVDQA ctr3StoreAVX2, DD0
2605	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
2606	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
2607
2608sealAVX2Tail512LoopA:
2609	polyAdd(0(oup))
2610	polyMul
2611	LEAQ 16(oup), oup
2612
2613sealAVX2Tail512LoopB:
2614	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2615	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2616	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2617	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2618	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2619	VMOVDQA  CC3, tmpStoreAVX2
2620	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2621	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2622	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2623	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2624	VMOVDQA  tmpStoreAVX2, CC3
2625	polyAdd(0*8(oup))
2626	polyMulAVX2
2627	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2628	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2629	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
2630	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2631	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2632	VMOVDQA  CC3, tmpStoreAVX2
2633	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
2634	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
2635	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
2636	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
2637	VMOVDQA  tmpStoreAVX2, CC3
2638	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
2639	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
2640	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	polyAdd(2*8(oup))
	polyMulAVX2
	LEAQ     (4*8)(oup), oup
	VMOVDQA  CC3, tmpStoreAVX2
	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA  tmpStoreAVX2, CC3
	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	VMOVDQA  CC3, tmpStoreAVX2
	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA  tmpStoreAVX2, CC3
	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3

	DECQ itr1
	JG   sealAVX2Tail512LoopA
	DECQ itr2
	JGE  sealAVX2Tail512LoopB

	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
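	// The initial state has been added back in (feed-forward). CC3 is
	// spilled so it can serve as scratch while the first 128 bytes are
	// written: each VPERM2I128 gathers the matching 128-bit lanes of two
	// register groups into one contiguous 32-byte chunk of keystream,
	// which is XORed with the plaintext and stored.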
	VMOVDQA    CC3, tmpStoreAVX2
	VPERM2I128 $0x02, AA0, BB0, CC3
	VPXOR      (0*32)(inp), CC3, CC3
	VMOVDQU    CC3, (0*32)(oup)
	VPERM2I128 $0x02, CC0, DD0, CC3
	VPXOR      (1*32)(inp), CC3, CC3
	VMOVDQU    CC3, (1*32)(oup)
	VPERM2I128 $0x13, AA0, BB0, CC3
	VPXOR      (2*32)(inp), CC3, CC3
	VMOVDQU    CC3, (2*32)(oup)
	VPERM2I128 $0x13, CC0, DD0, CC3
	VPXOR      (3*32)(inp), CC3, CC3
	VMOVDQU    CC3, (3*32)(oup)

	VPERM2I128 $0x02, AA1, BB1, AA0
	VPERM2I128 $0x02, CC1, DD1, BB0
	VPERM2I128 $0x13, AA1, BB1, CC0
	VPERM2I128 $0x13, CC1, DD1, DD0
	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)

	VPERM2I128 $0x02, AA2, BB2, AA0
	VPERM2I128 $0x02, CC2, DD2, BB0
	VPERM2I128 $0x13, AA2, BB2, CC0
	VPERM2I128 $0x13, CC2, DD2, DD0
	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)

	MOVQ       $384, itr1
	LEAQ       384(inp), inp
	SUBQ       $384, inl
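	// The 384 bytes of ciphertext written above have not been hashed yet;
	// itr1 records that count for sealAVX2SealHash. The keystream for the
	// remaining (at most 128) input bytes is left de-interleaved in
	// AA0, BB0, CC0 and DD0.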
	VPERM2I128 $0x02, AA3, BB3, AA0
	VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0
	VPERM2I128 $0x13, AA3, BB3, CC0
	VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0

	JMP sealAVX2SealHash