• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
; NASM preamble (Intel operand order).
; `default rel` makes label-only memory operands RIP-relative by default.
1default	rel
; The three size keywords are defined as empty, so the XMMWORD[...] style
; annotations used on memory operands in this file expand to plain [...].
2%define XMMWORD
3%define YMMWORD
4%define ZMMWORD
; All code below is placed in a 64-byte aligned .text section.
5section	.text code align=64
6
7
8
9ALIGN	32
;-----------------------------------------------------------------------
; _aesni_ctr32_ghash_6x -- internal helper (not declared global), reached
; by `call` from aesni_gcm_decrypt and aesni_gcm_encrypt in this file.
; Every pass of $L$oop6x runs six counter blocks through the vaesenc
; rounds while the interleaved vpclmulqdq/vpxor chain multiplies six
; previously byte-reversed 16-byte blocks into the hash accumulator.
;
; Register roles, as set up by the callers in this file:
;   rdi      input pointer  (advanced by 96 per pass)
;   rsi      output pointer (advanced by 96 per pass)
;   rdx      length in 16-byte blocks (callers do `shr rdx,4`)
;   rcx      key schedule pointer, biased by +128 by the callers
;   r8       16-byte counter block; the next counter is written back here
;   r9       table pointer, biased by +64 by the callers
;            (presumably the precomputed hash-key table -- confirm)
;   r10      running byte count, +0x60 per pass; callers return it in rax
;   r11      -> $L$bswap_mask; the constants that follow it are reached
;            by fixed offsets (+16 $L$poly, +32 $L$one_msb, +48 $L$two_lsb,
;            +64 $L$one_lsb)
;   r14/r15  cursor / upper bound for the movbe loads of data to hash
;   ebx      dword at offset 12 of the counter block
;   ebp      dword read by the callers from key schedule + 240; only
;            compared against 11 here to pick the number of rounds
;   xmm1     current counter block
;   xmm8     hash accumulator
;   xmm7 and the caller's stack slots 48..112 hold the blocks to hash
; Stack operands are written (N+8)+rsp: the +8 skips this routine's
; return address, so "slot N" below means the caller's [N+rsp].
; On return xmm9..xmm14 hold the last six output blocks, which have NOT
; been stored yet (callers store them at [-96+rsi]..[-16+rsi]).
;-----------------------------------------------------------------------
10_aesni_ctr32_ghash_6x:
	; xmm2 = $L$one_msb: a 1 in byte 15, used with vpaddb to step the counter
11	vmovdqu	xmm2,XMMWORD[32+r11]
	; pre-subtract the first group of six blocks from the block count
12	sub	rdx,6
13	vpxor	xmm4,xmm4,xmm4
	; xmm15 = first 16 bytes of the key schedule (rcx is biased by +128)
14	vmovdqu	xmm15,XMMWORD[((0-128))+rcx]
	; xmm10..xmm14 = xmm1 with byte 15 incremented by 1..5
15	vpaddb	xmm10,xmm1,xmm2
16	vpaddb	xmm11,xmm10,xmm2
17	vpaddb	xmm12,xmm11,xmm2
18	vpaddb	xmm13,xmm12,xmm2
19	vpaddb	xmm14,xmm13,xmm2
	; first block already gets the initial key xor
20	vpxor	xmm9,xmm1,xmm15
	; slot 16 starts out as zero (xmm4 was cleared above)
21	vmovdqu	XMMWORD[(16+8)+rsp],xmm4
22	jmp	NEAR $L$oop6x
23
24ALIGN	32
25$L$oop6x:
	; 100663296 = 6<<24: bump the top byte of ebx (byte 15 of the counter
	; block) by six.  A carry means the byte-wise vpaddb increments would
	; wrap, so the slow path recomputes the counters with 32-bit adds.
26	add	ebx,100663296
27	jc	NEAR $L$handle_ctr32
28	vmovdqu	xmm3,XMMWORD[((0-32))+r9]
	; xmm1 = counter for the following pass
29	vpaddb	xmm1,xmm14,xmm2
30	vpxor	xmm10,xmm10,xmm15
31	vpxor	xmm11,xmm11,xmm15
32
33$L$resume_ctr32:
	; save the next counter block; $L$enc_tail reloads it from [r8]
34	vmovdqu	XMMWORD[r8],xmm1
	; start multiplying xmm7 by the table entry in xmm3.  Across this
	; pass xmm4 collects the 0x00 products, xmm7 the 0x11 products and
	; xmm6 the 0x10/0x01 cross products.
35	vpclmulqdq	xmm5,xmm7,xmm3,0x10
36	vpxor	xmm12,xmm12,xmm15
	; xmm2 = round key at key+16
37	vmovups	xmm2,XMMWORD[((16-128))+rcx]
38	vpclmulqdq	xmm6,xmm7,xmm3,0x01
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
	; r12 = (r15 >= r14, unsigned) ? 0x60 : 0, built with setnc/neg/and
	; below; none of the vector instructions in between modify the flags.
56	xor	r12,r12
57	cmp	r15,r14
58
59	vaesenc	xmm9,xmm9,xmm2
	; xmm0 = block waiting in slot 48
60	vmovdqu	xmm0,XMMWORD[((48+8))+rsp]
61	vpxor	xmm13,xmm13,xmm15
62	vpclmulqdq	xmm1,xmm7,xmm3,0x00
63	vaesenc	xmm10,xmm10,xmm2
64	vpxor	xmm14,xmm14,xmm15
65	setnc	r12b
66	vpclmulqdq	xmm7,xmm7,xmm3,0x11
67	vaesenc	xmm11,xmm11,xmm2
68	vmovdqu	xmm3,XMMWORD[((16-32))+r9]
69	neg	r12
70	vaesenc	xmm12,xmm12,xmm2
71	vpxor	xmm6,xmm6,xmm5
72	vpclmulqdq	xmm5,xmm0,xmm3,0x00
	; fold the pending reduction term (xmm4 from the previous pass, zero
	; on the first pass) into the accumulator
73	vpxor	xmm8,xmm8,xmm4
74	vaesenc	xmm13,xmm13,xmm2
75	vpxor	xmm4,xmm1,xmm5
76	and	r12,0x60
	; xmm15 = round key at key+32
77	vmovups	xmm15,XMMWORD[((32-128))+rcx]
78	vpclmulqdq	xmm1,xmm0,xmm3,0x10
79	vaesenc	xmm14,xmm14,xmm2
80
81	vpclmulqdq	xmm2,xmm0,xmm3,0x01
	; advance the hash-input cursor by 0x60 only while r14 <= r15
82	lea	r14,[r12*1+r14]
83	vaesenc	xmm9,xmm9,xmm15
	; xor in slot 16 (zero initially, later the high half saved in $L$enc_tail)
84	vpxor	xmm8,xmm8,XMMWORD[((16+8))+rsp]
85	vpclmulqdq	xmm3,xmm0,xmm3,0x11
86	vmovdqu	xmm0,XMMWORD[((64+8))+rsp]
87	vaesenc	xmm10,xmm10,xmm15
	; movbe pairs: load the 16 bytes at r14+80 byte-reversed and park them
	; in slot 32 for the next pass (high qword goes to the lower address)
88	movbe	r13,QWORD[88+r14]
89	vaesenc	xmm11,xmm11,xmm15
90	movbe	r12,QWORD[80+r14]
91	vaesenc	xmm12,xmm12,xmm15
92	mov	QWORD[((32+8))+rsp],r13
93	vaesenc	xmm13,xmm13,xmm15
94	mov	QWORD[((40+8))+rsp],r12
95	vmovdqu	xmm5,XMMWORD[((48-32))+r9]
96	vaesenc	xmm14,xmm14,xmm15
97
	; xmm15 = round key at key+48
98	vmovups	xmm15,XMMWORD[((48-128))+rcx]
99	vpxor	xmm6,xmm6,xmm1
100	vpclmulqdq	xmm1,xmm0,xmm5,0x00
101	vaesenc	xmm9,xmm9,xmm15
102	vpxor	xmm6,xmm6,xmm2
103	vpclmulqdq	xmm2,xmm0,xmm5,0x10
104	vaesenc	xmm10,xmm10,xmm15
105	vpxor	xmm7,xmm7,xmm3
106	vpclmulqdq	xmm3,xmm0,xmm5,0x01
107	vaesenc	xmm11,xmm11,xmm15
108	vpclmulqdq	xmm5,xmm0,xmm5,0x11
109	vmovdqu	xmm0,XMMWORD[((80+8))+rsp]
110	vaesenc	xmm12,xmm12,xmm15
111	vaesenc	xmm13,xmm13,xmm15
112	vpxor	xmm4,xmm4,xmm1
113	vmovdqu	xmm1,XMMWORD[((64-32))+r9]
114	vaesenc	xmm14,xmm14,xmm15
115
	; xmm15 = round key at key+64
116	vmovups	xmm15,XMMWORD[((64-128))+rcx]
117	vpxor	xmm6,xmm6,xmm2
118	vpclmulqdq	xmm2,xmm0,xmm1,0x00
119	vaesenc	xmm9,xmm9,xmm15
120	vpxor	xmm6,xmm6,xmm3
121	vpclmulqdq	xmm3,xmm0,xmm1,0x10
122	vaesenc	xmm10,xmm10,xmm15
	; block at r14+64, byte-reversed -> slot 48
123	movbe	r13,QWORD[72+r14]
124	vpxor	xmm7,xmm7,xmm5
125	vpclmulqdq	xmm5,xmm0,xmm1,0x01
126	vaesenc	xmm11,xmm11,xmm15
127	movbe	r12,QWORD[64+r14]
128	vpclmulqdq	xmm1,xmm0,xmm1,0x11
129	vmovdqu	xmm0,XMMWORD[((96+8))+rsp]
130	vaesenc	xmm12,xmm12,xmm15
131	mov	QWORD[((48+8))+rsp],r13
132	vaesenc	xmm13,xmm13,xmm15
133	mov	QWORD[((56+8))+rsp],r12
134	vpxor	xmm4,xmm4,xmm2
135	vmovdqu	xmm2,XMMWORD[((96-32))+r9]
136	vaesenc	xmm14,xmm14,xmm15
137
	; xmm15 = round key at key+80
138	vmovups	xmm15,XMMWORD[((80-128))+rcx]
139	vpxor	xmm6,xmm6,xmm3
140	vpclmulqdq	xmm3,xmm0,xmm2,0x00
141	vaesenc	xmm9,xmm9,xmm15
142	vpxor	xmm6,xmm6,xmm5
143	vpclmulqdq	xmm5,xmm0,xmm2,0x10
144	vaesenc	xmm10,xmm10,xmm15
	; block at r14+48, byte-reversed -> slot 64
145	movbe	r13,QWORD[56+r14]
146	vpxor	xmm7,xmm7,xmm1
147	vpclmulqdq	xmm1,xmm0,xmm2,0x01
	; accumulator ^= block in slot 112; xmm8 is multiplied last, below
148	vpxor	xmm8,xmm8,XMMWORD[((112+8))+rsp]
149	vaesenc	xmm11,xmm11,xmm15
150	movbe	r12,QWORD[48+r14]
151	vpclmulqdq	xmm2,xmm0,xmm2,0x11
152	vaesenc	xmm12,xmm12,xmm15
153	mov	QWORD[((64+8))+rsp],r13
154	vaesenc	xmm13,xmm13,xmm15
155	mov	QWORD[((72+8))+rsp],r12
156	vpxor	xmm4,xmm4,xmm3
157	vmovdqu	xmm3,XMMWORD[((112-32))+r9]
158	vaesenc	xmm14,xmm14,xmm15
159
	; xmm15 = round key at key+96
160	vmovups	xmm15,XMMWORD[((96-128))+rcx]
161	vpxor	xmm6,xmm6,xmm5
162	vpclmulqdq	xmm5,xmm8,xmm3,0x10
163	vaesenc	xmm9,xmm9,xmm15
164	vpxor	xmm6,xmm6,xmm1
165	vpclmulqdq	xmm1,xmm8,xmm3,0x01
166	vaesenc	xmm10,xmm10,xmm15
	; block at r14+32, byte-reversed -> slot 80
167	movbe	r13,QWORD[40+r14]
168	vpxor	xmm7,xmm7,xmm2
169	vpclmulqdq	xmm2,xmm8,xmm3,0x00
170	vaesenc	xmm11,xmm11,xmm15
171	movbe	r12,QWORD[32+r14]
172	vpclmulqdq	xmm8,xmm8,xmm3,0x11
173	vaesenc	xmm12,xmm12,xmm15
174	mov	QWORD[((80+8))+rsp],r13
175	vaesenc	xmm13,xmm13,xmm15
176	mov	QWORD[((88+8))+rsp],r12
177	vpxor	xmm6,xmm6,xmm5
178	vaesenc	xmm14,xmm14,xmm15
179	vpxor	xmm6,xmm6,xmm1
180
	; xmm15 = round key at key+112
181	vmovups	xmm15,XMMWORD[((112-128))+rcx]
	; split the cross-product sum xmm6: its low qword is shifted into the
	; upper half of the low sum xmm4 here, the rest joins xmm7 further down
182	vpslldq	xmm5,xmm6,8
183	vpxor	xmm4,xmm4,xmm2
	; xmm3 = $L$poly, the constant used by the two reduction multiplies
184	vmovdqu	xmm3,XMMWORD[16+r11]
185
186	vaesenc	xmm9,xmm9,xmm15
187	vpxor	xmm7,xmm7,xmm8
188	vaesenc	xmm10,xmm10,xmm15
189	vpxor	xmm4,xmm4,xmm5
	; block at r14+16, byte-reversed -> slot 96
190	movbe	r13,QWORD[24+r14]
191	vaesenc	xmm11,xmm11,xmm15
192	movbe	r12,QWORD[16+r14]
	; first reduction step: swap the qwords of xmm4 and xor with
	; clmul(xmm4, $L$poly) (the xor happens a few lines below)
193	vpalignr	xmm0,xmm4,xmm4,8
194	vpclmulqdq	xmm4,xmm4,xmm3,0x10
195	mov	QWORD[((96+8))+rsp],r13
196	vaesenc	xmm12,xmm12,xmm15
197	mov	QWORD[((104+8))+rsp],r12
198	vaesenc	xmm13,xmm13,xmm15
	; xmm1 = round key at key+128
199	vmovups	xmm1,XMMWORD[((128-128))+rcx]
200	vaesenc	xmm14,xmm14,xmm15
201
202	vaesenc	xmm9,xmm9,xmm1
	; xmm15 = round key at key+144
203	vmovups	xmm15,XMMWORD[((144-128))+rcx]
204	vaesenc	xmm10,xmm10,xmm1
205	vpsrldq	xmm6,xmm6,8
206	vaesenc	xmm11,xmm11,xmm1
207	vpxor	xmm7,xmm7,xmm6
208	vaesenc	xmm12,xmm12,xmm1
209	vpxor	xmm4,xmm4,xmm0
	; block at r14+0, byte-reversed; it is stored to slot 112 in $L$enc_tail
210	movbe	r13,QWORD[8+r14]
211	vaesenc	xmm13,xmm13,xmm1
212	movbe	r12,QWORD[r14]
213	vaesenc	xmm14,xmm14,xmm1
	; xmm1 = round key at key+160
214	vmovups	xmm1,XMMWORD[((160-128))+rcx]
	; ebp < 11: go straight to the tail; ebp == 11: one extra pair of
	; rounds; otherwise two extra pairs.  The flags of this cmp are still
	; intact at the `je` below (vaesenc/vmovups do not touch them).
	; NOTE(review): presumably ebp is 9/11/13 for 128/192/256-bit keys -- confirm.
215	cmp	ebp,11
216	jb	NEAR $L$enc_tail
217
218	vaesenc	xmm9,xmm9,xmm15
219	vaesenc	xmm10,xmm10,xmm15
220	vaesenc	xmm11,xmm11,xmm15
221	vaesenc	xmm12,xmm12,xmm15
222	vaesenc	xmm13,xmm13,xmm15
223	vaesenc	xmm14,xmm14,xmm15
224
225	vaesenc	xmm9,xmm9,xmm1
226	vaesenc	xmm10,xmm10,xmm1
227	vaesenc	xmm11,xmm11,xmm1
228	vaesenc	xmm12,xmm12,xmm1
229	vaesenc	xmm13,xmm13,xmm1
230	vmovups	xmm15,XMMWORD[((176-128))+rcx]
231	vaesenc	xmm14,xmm14,xmm1
232	vmovups	xmm1,XMMWORD[((192-128))+rcx]
233	je	NEAR $L$enc_tail
234
235	vaesenc	xmm9,xmm9,xmm15
236	vaesenc	xmm10,xmm10,xmm15
237	vaesenc	xmm11,xmm11,xmm15
238	vaesenc	xmm12,xmm12,xmm15
239	vaesenc	xmm13,xmm13,xmm15
240	vaesenc	xmm14,xmm14,xmm15
241
242	vaesenc	xmm9,xmm9,xmm1
243	vaesenc	xmm10,xmm10,xmm1
244	vaesenc	xmm11,xmm11,xmm1
245	vaesenc	xmm12,xmm12,xmm1
246	vaesenc	xmm13,xmm13,xmm1
247	vmovups	xmm15,XMMWORD[((208-128))+rcx]
248	vaesenc	xmm14,xmm14,xmm1
249	vmovups	xmm1,XMMWORD[((224-128))+rcx]
250	jmp	NEAR $L$enc_tail
251
252ALIGN	32
253$L$handle_ctr32:
	; Slow path taken when byte 15 of the counter would wrap: byte-swap
	; the counter with the mask at [r11], step it with 32-bit adds of the
	; $L$one_lsb / $L$two_lsb constants, then swap each result back.
254	vmovdqu	xmm0,XMMWORD[r11]
255	vpshufb	xmm6,xmm1,xmm0
	; xmm5 = $L$two_lsb
256	vmovdqu	xmm5,XMMWORD[48+r11]
	; xmm10 = swapped counter + 1 ($L$one_lsb), xmm11 = +2, and so on
257	vpaddd	xmm10,xmm6,XMMWORD[64+r11]
258	vpaddd	xmm11,xmm6,xmm5
	; same table entry the fast path loads before falling into $L$resume_ctr32
259	vmovdqu	xmm3,XMMWORD[((0-32))+r9]
260	vpaddd	xmm12,xmm10,xmm5
261	vpshufb	xmm10,xmm10,xmm0
262	vpaddd	xmm13,xmm11,xmm5
263	vpshufb	xmm11,xmm11,xmm0
264	vpxor	xmm10,xmm10,xmm15
265	vpaddd	xmm14,xmm12,xmm5
266	vpshufb	xmm12,xmm12,xmm0
267	vpxor	xmm11,xmm11,xmm15
	; xmm1 = counter for the following pass (+6)
268	vpaddd	xmm1,xmm13,xmm5
269	vpshufb	xmm13,xmm13,xmm0
270	vpshufb	xmm14,xmm14,xmm0
271	vpshufb	xmm1,xmm1,xmm0
272	jmp	NEAR $L$resume_ctr32
273
274ALIGN	32
275$L$enc_tail:
	; On entry xmm15 is the round key for the last vaesenc and xmm1 the
	; key for vaesenclast.  xmm1 is first xored with the six input blocks,
	; so vaesenclast yields cipher output xor input in one step.
276	vaesenc	xmm9,xmm9,xmm15
	; park the high half of the hash product in slot 16 for the next pass
277	vmovdqu	XMMWORD[(16+8)+rsp],xmm7
	; second reduction step; its result is combined into xmm8 at the top
	; of the next pass or in $L$6x_done
278	vpalignr	xmm8,xmm4,xmm4,8
279	vaesenc	xmm10,xmm10,xmm15
280	vpclmulqdq	xmm4,xmm4,xmm3,0x10
281	vpxor	xmm2,xmm1,XMMWORD[rdi]
282	vaesenc	xmm11,xmm11,xmm15
283	vpxor	xmm0,xmm1,XMMWORD[16+rdi]
284	vaesenc	xmm12,xmm12,xmm15
285	vpxor	xmm5,xmm1,XMMWORD[32+rdi]
286	vaesenc	xmm13,xmm13,xmm15
287	vpxor	xmm6,xmm1,XMMWORD[48+rdi]
288	vaesenc	xmm14,xmm14,xmm15
289	vpxor	xmm7,xmm1,XMMWORD[64+rdi]
290	vpxor	xmm3,xmm1,XMMWORD[80+rdi]
	; reload the counter that $L$resume_ctr32 saved at [r8]
291	vmovdqu	xmm1,XMMWORD[r8]
292
293	vaesenclast	xmm9,xmm9,xmm2
	; xmm2 = $L$one_msb again; xmm0,xmm5,xmm6,xmm7,xmm3 become the next
	; five counters (xmm1 + 1..5 in byte 15)
294	vmovdqu	xmm2,XMMWORD[32+r11]
295	vaesenclast	xmm10,xmm10,xmm0
296	vpaddb	xmm0,xmm1,xmm2
	; finish the movbe pair started before the tail: block at r14+0 -> slot 112
297	mov	QWORD[((112+8))+rsp],r13
298	lea	rdi,[96+rdi]
299	vaesenclast	xmm11,xmm11,xmm5
300	vpaddb	xmm5,xmm0,xmm2
301	mov	QWORD[((120+8))+rsp],r12
302	lea	rsi,[96+rsi]
	; xmm15 = first 16 bytes of the key schedule, for the next pass
303	vmovdqu	xmm15,XMMWORD[((0-128))+rcx]
304	vaesenclast	xmm12,xmm12,xmm6
305	vpaddb	xmm6,xmm5,xmm2
306	vaesenclast	xmm13,xmm13,xmm7
307	vpaddb	xmm7,xmm6,xmm2
308	vaesenclast	xmm14,xmm14,xmm3
309	vpaddb	xmm3,xmm7,xmm2
310
	; 0x60 more bytes done; leave the loop when the block count borrows
311	add	r10,0x60
312	sub	rdx,0x6
313	jc	NEAR $L$6x_done
314
	; store the six finished blocks and move the next counters into place
315	vmovups	XMMWORD[(-96)+rsi],xmm9
316	vpxor	xmm9,xmm1,xmm15
317	vmovups	XMMWORD[(-80)+rsi],xmm10
318	vmovdqa	xmm10,xmm0
319	vmovups	XMMWORD[(-64)+rsi],xmm11
320	vmovdqa	xmm11,xmm5
321	vmovups	XMMWORD[(-48)+rsi],xmm12
322	vmovdqa	xmm12,xmm6
323	vmovups	XMMWORD[(-32)+rsi],xmm13
324	vmovdqa	xmm13,xmm7
325	vmovups	XMMWORD[(-16)+rsi],xmm14
326	vmovdqa	xmm14,xmm3
	; xmm7 = block in slot 32, the first one multiplied in the next pass
327	vmovdqu	xmm7,XMMWORD[((32+8))+rsp]
328	jmp	NEAR $L$oop6x
329
330$L$6x_done:
	; complete the accumulator: xor in the saved high half (slot 16) and
	; the last reduction term.  xmm9..xmm14 are left for the caller to store.
331	vpxor	xmm8,xmm8,XMMWORD[((16+8))+rsp]
332	vpxor	xmm8,xmm8,xmm4
333
334	DB	0F3h,0C3h		;repret
335
;-----------------------------------------------------------------------
; aesni_gcm_decrypt -- exported entry point, Microsoft x64 convention.
; The six arguments arrive in rcx, rdx, r8, r9, [rsp+40], [rsp+48] and
; are moved into rdi, rsi, rdx, rcx, r8, r9 before the body runs:
;   rdi = input, rsi = output, rdx = length in bytes,
;   rcx = key schedule, r8 = 16-byte counter block,
;   r9  = state whose first 16 bytes are the hash value (read at the
;         start, written back at the end); the helper reads tables at
;         higher offsets from it.
; Returns in rax the byte count accumulated in r10: 0 when the length is
; below 0x60, otherwise 0x60 for every six-block pass of the helper.
; rbx, rbp, r12-r15 and xmm6-xmm15 are saved below the entry rsp (kept in
; rax); gcm_se_handler relies on the same offsets.
; NOTE(review): the C prototype is not visible here -- confirm argument
; names and types against the caller.
;-----------------------------------------------------------------------
336global	aesni_gcm_decrypt
337
338ALIGN	32
339aesni_gcm_decrypt:
340	mov	QWORD[8+rsp],rdi	;WIN64 prologue
341	mov	QWORD[16+rsp],rsi
	; rax = rsp at entry; it serves as the frame base until the epilogue
342	mov	rax,rsp
343$L$SEH_begin_aesni_gcm_decrypt:
	; shuffle the Win64 argument registers into the layout the body expects
344	mov	rdi,rcx
345	mov	rsi,rdx
346	mov	rdx,r8
347	mov	rcx,r9
	; 5th and 6th arguments come from the stack
348	mov	r8,QWORD[40+rsp]
349	mov	r9,QWORD[48+rsp]
350
351
	; r10 = return value (bytes processed), 0 so far
352	xor	r10,r10
353
354
355
	; fewer than 0x60 bytes (six blocks): nothing is processed
356	cmp	rdx,0x60
357	jb	NEAR $L$gcm_dec_abort
358
359	lea	rax,[rsp]
	; callee-saved GPRs land at [rax-8] .. [rax-48]
360	push	rbx
361	push	rbp
362	push	r12
363	push	r13
364	push	r14
365	push	r15
	; room for xmm6..xmm15 at [rax-216] .. [rax-72]
366	lea	rsp,[((-168))+rsp]
367	movaps	XMMWORD[(-216)+rax],xmm6
368	movaps	XMMWORD[(-200)+rax],xmm7
369	movaps	XMMWORD[(-184)+rax],xmm8
370	movaps	XMMWORD[(-168)+rax],xmm9
371	movaps	XMMWORD[(-152)+rax],xmm10
372	movaps	XMMWORD[(-136)+rax],xmm11
373	movaps	XMMWORD[(-120)+rax],xmm12
374	movaps	XMMWORD[(-104)+rax],xmm13
375	movaps	XMMWORD[(-88)+rax],xmm14
376	movaps	XMMWORD[(-72)+rax],xmm15
377$L$gcm_dec_body:
378	vzeroupper
379
	; xmm1 = counter block, ebx = its last dword
380	vmovdqu	xmm1,XMMWORD[r8]
	; carve out a 128-byte aligned scratch area (add/and rsp below)
381	add	rsp,-128
382	mov	ebx,DWORD[12+r8]
	; r11 -> constant table ($L$bswap_mask and the constants after it)
383	lea	r11,[$L$bswap_mask]
	; r14 = key - 128 and r15 = 0xf80 feed the aliasing check below
384	lea	r14,[((-128))+rcx]
385	mov	r15,0xf80
	; xmm8 = current hash value
386	vmovdqu	xmm8,XMMWORD[r9]
387	and	rsp,-128
	; xmm0 = byte-swap shuffle mask
388	vmovdqu	xmm0,XMMWORD[r11]
	; bias the key and table pointers the way the helper expects
389	lea	rcx,[128+rcx]
390	lea	r9,[((32+32))+r9]
	; ebp = dword at key+240 (rcx already carries the +128 bias)
391	mov	ebp,DWORD[((240-128))+rcx]
392	vpshufb	xmm8,xmm8,xmm0
393
	; Compare address bits 7..11 (mask 0xf80) of the scratch area and of
	; key-128: when 0 <= difference < 768 the stack is lowered by that
	; difference.  NOTE(review): per the label name this is meant to avoid
	; the stack frame aliasing the key schedule -- confirm the rationale.
394	and	r14,r15
395	and	r15,rsp
396	sub	r15,r14
397	jc	NEAR $L$dec_no_key_aliasing
398	cmp	r15,768
399	jnc	NEAR $L$dec_no_key_aliasing
400	sub	rsp,r15
401$L$dec_no_key_aliasing:
402
	; Load the first six input blocks and byte-swap them: the block at
	; +80 stays in xmm7, the others go to stack slots 48..112, which is
	; where the helper expects the data to hash.
403	vmovdqu	xmm7,XMMWORD[80+rdi]
	; r14 = cursor for the helper's movbe loads (the input itself here)
404	lea	r14,[rdi]
405	vmovdqu	xmm4,XMMWORD[64+rdi]
406
407
408
409
410
411
412
	; r15 = input + length - 192, the bound r14 is compared against
413	lea	r15,[((-192))+rdx*1+rdi]
414
415	vmovdqu	xmm5,XMMWORD[48+rdi]
	; length in bytes -> length in 16-byte blocks
416	shr	rdx,4
417	xor	r10,r10
418	vmovdqu	xmm6,XMMWORD[32+rdi]
419	vpshufb	xmm7,xmm7,xmm0
420	vmovdqu	xmm2,XMMWORD[16+rdi]
421	vpshufb	xmm4,xmm4,xmm0
422	vmovdqu	xmm3,XMMWORD[rdi]
423	vpshufb	xmm5,xmm5,xmm0
424	vmovdqu	XMMWORD[48+rsp],xmm4
425	vpshufb	xmm6,xmm6,xmm0
426	vmovdqu	XMMWORD[64+rsp],xmm5
427	vpshufb	xmm2,xmm2,xmm0
428	vmovdqu	XMMWORD[80+rsp],xmm6
429	vpshufb	xmm3,xmm3,xmm0
430	vmovdqu	XMMWORD[96+rsp],xmm2
431	vmovdqu	XMMWORD[112+rsp],xmm3
432
433	call	_aesni_ctr32_ghash_6x
434
	; the helper leaves its last six output blocks in xmm9..xmm14 with
	; rsi already advanced past them
435	vmovups	XMMWORD[(-96)+rsi],xmm9
436	vmovups	XMMWORD[(-80)+rsi],xmm10
437	vmovups	XMMWORD[(-64)+rsi],xmm11
438	vmovups	XMMWORD[(-48)+rsi],xmm12
439	vmovups	XMMWORD[(-32)+rsi],xmm13
440	vmovups	XMMWORD[(-16)+rsi],xmm14
441
	; swap the hash value back and store it where it was loaded from
	; (r9 was advanced by 64 above)
442	vpshufb	xmm8,xmm8,XMMWORD[r11]
443	vmovdqu	XMMWORD[(-64)+r9],xmm8
444
445	vzeroupper
	; restore xmm6..xmm15 and the callee-saved GPRs from the frame at rax
446	movaps	xmm6,XMMWORD[((-216))+rax]
447	movaps	xmm7,XMMWORD[((-200))+rax]
448	movaps	xmm8,XMMWORD[((-184))+rax]
449	movaps	xmm9,XMMWORD[((-168))+rax]
450	movaps	xmm10,XMMWORD[((-152))+rax]
451	movaps	xmm11,XMMWORD[((-136))+rax]
452	movaps	xmm12,XMMWORD[((-120))+rax]
453	movaps	xmm13,XMMWORD[((-104))+rax]
454	movaps	xmm14,XMMWORD[((-88))+rax]
455	movaps	xmm15,XMMWORD[((-72))+rax]
456	mov	r15,QWORD[((-48))+rax]
457	mov	r14,QWORD[((-40))+rax]
458	mov	r13,QWORD[((-32))+rax]
459	mov	r12,QWORD[((-24))+rax]
460	mov	rbp,QWORD[((-16))+rax]
461	mov	rbx,QWORD[((-8))+rax]
	; back to the entry stack pointer
462	lea	rsp,[rax]
463$L$gcm_dec_abort:
	; return the processed byte count
464	mov	rax,r10
465	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
466	mov	rsi,QWORD[16+rsp]
467	DB	0F3h,0C3h		;repret
468$L$SEH_end_aesni_gcm_decrypt:
469
470ALIGN	32
;-----------------------------------------------------------------------
; _aesni_ctr32_6x -- internal helper, called twice by aesni_gcm_encrypt.
; Encrypts six consecutive counter blocks starting at xmm1, xors the
; result with the 96 bytes at rdi and stores 96 bytes at rsi.
; In:    rdi input, rsi output, rcx key schedule biased by +128,
;        r11 -> $L$bswap_mask, ebp = dword from key+240,
;        ebx = last dword of the counter block, xmm1 = counter block,
;        xmm0 = byte-swap mask (only used on the wrap path)
; Out:   xmm9..xmm14 = the six output blocks (also stored at rsi),
;        xmm1 = counter advanced by six, rdi and rsi advanced by 96,
;        ebx advanced by 6<<24
; Clobb: r12, r13, xmm2-xmm6, xmm8, xmm15, flags
;-----------------------------------------------------------------------
471_aesni_ctr32_6x:
	; xmm4 = first 16 bytes of the key schedule, xmm2 = $L$one_msb
472	vmovdqu	xmm4,XMMWORD[((0-128))+rcx]
473	vmovdqu	xmm2,XMMWORD[32+r11]
	; r13 = ebp-1 loop iterations; with the vaesenc after the loop that
	; makes ebp vaesenc rounds followed by one vaesenclast
474	lea	r13,[((-1))+rbp]
	; xmm15 = round key at key+16, r12 -> round key at key+32
475	vmovups	xmm15,XMMWORD[((16-128))+rcx]
476	lea	r12,[((32-128))+rcx]
477	vpxor	xmm9,xmm1,xmm4
	; 100663296 = 6<<24: a carry out of the top byte of ebx means byte 15
	; of the counter would wrap, so take the 32-bit add path instead
478	add	ebx,100663296
479	jc	NEAR $L$handle_ctr32_2
	; xmm10..xmm14 = counter + 1..5 in byte 15, each xored with xmm4;
	; xmm1 = counter + 6 for the next call
480	vpaddb	xmm10,xmm1,xmm2
481	vpaddb	xmm11,xmm10,xmm2
482	vpxor	xmm10,xmm10,xmm4
483	vpaddb	xmm12,xmm11,xmm2
484	vpxor	xmm11,xmm11,xmm4
485	vpaddb	xmm13,xmm12,xmm2
486	vpxor	xmm12,xmm12,xmm4
487	vpaddb	xmm14,xmm13,xmm2
488	vpxor	xmm13,xmm13,xmm4
489	vpaddb	xmm1,xmm14,xmm2
490	vpxor	xmm14,xmm14,xmm4
491	jmp	NEAR $L$oop_ctr32
492
493ALIGN	16
494$L$oop_ctr32:
	; one round on all six blocks, then fetch the next round key
495	vaesenc	xmm9,xmm9,xmm15
496	vaesenc	xmm10,xmm10,xmm15
497	vaesenc	xmm11,xmm11,xmm15
498	vaesenc	xmm12,xmm12,xmm15
499	vaesenc	xmm13,xmm13,xmm15
500	vaesenc	xmm14,xmm14,xmm15
501	vmovups	xmm15,XMMWORD[r12]
502	lea	r12,[16+r12]
503	dec	r13d
504	jnz	NEAR $L$oop_ctr32
505
	; xmm3 = last round key; it is xored with each input block so that
	; vaesenclast produces cipher output xor input directly
506	vmovdqu	xmm3,XMMWORD[r12]
507	vaesenc	xmm9,xmm9,xmm15
508	vpxor	xmm4,xmm3,XMMWORD[rdi]
509	vaesenc	xmm10,xmm10,xmm15
510	vpxor	xmm5,xmm3,XMMWORD[16+rdi]
511	vaesenc	xmm11,xmm11,xmm15
512	vpxor	xmm6,xmm3,XMMWORD[32+rdi]
513	vaesenc	xmm12,xmm12,xmm15
514	vpxor	xmm8,xmm3,XMMWORD[48+rdi]
515	vaesenc	xmm13,xmm13,xmm15
516	vpxor	xmm2,xmm3,XMMWORD[64+rdi]
517	vaesenc	xmm14,xmm14,xmm15
518	vpxor	xmm3,xmm3,XMMWORD[80+rdi]
519	lea	rdi,[96+rdi]
520
521	vaesenclast	xmm9,xmm9,xmm4
522	vaesenclast	xmm10,xmm10,xmm5
523	vaesenclast	xmm11,xmm11,xmm6
524	vaesenclast	xmm12,xmm12,xmm8
525	vaesenclast	xmm13,xmm13,xmm2
526	vaesenclast	xmm14,xmm14,xmm3
	; store the six output blocks and advance the output pointer
527	vmovups	XMMWORD[rsi],xmm9
528	vmovups	XMMWORD[16+rsi],xmm10
529	vmovups	XMMWORD[32+rsi],xmm11
530	vmovups	XMMWORD[48+rsi],xmm12
531	vmovups	XMMWORD[64+rsi],xmm13
532	vmovups	XMMWORD[80+rsi],xmm14
533	lea	rsi,[96+rsi]
534
535	DB	0F3h,0C3h		;repret
536ALIGN	32
537$L$handle_ctr32_2:
	; Wrap path: byte-swap the counter with xmm0, step it with 32-bit
	; adds of $L$one_lsb ([64+r11]) and $L$two_lsb ([48+r11]), swap each
	; result back and apply the xmm4 key xor, then rejoin the round loop.
538	vpshufb	xmm6,xmm1,xmm0
539	vmovdqu	xmm5,XMMWORD[48+r11]
540	vpaddd	xmm10,xmm6,XMMWORD[64+r11]
541	vpaddd	xmm11,xmm6,xmm5
542	vpaddd	xmm12,xmm10,xmm5
543	vpshufb	xmm10,xmm10,xmm0
544	vpaddd	xmm13,xmm11,xmm5
545	vpshufb	xmm11,xmm11,xmm0
546	vpxor	xmm10,xmm10,xmm4
547	vpaddd	xmm14,xmm12,xmm5
548	vpshufb	xmm12,xmm12,xmm0
549	vpxor	xmm11,xmm11,xmm4
	; xmm1 = counter + 6 for the next call
550	vpaddd	xmm1,xmm13,xmm5
551	vpshufb	xmm13,xmm13,xmm0
552	vpxor	xmm12,xmm12,xmm4
553	vpshufb	xmm14,xmm14,xmm0
554	vpxor	xmm13,xmm13,xmm4
555	vpshufb	xmm1,xmm1,xmm0
556	vpxor	xmm14,xmm14,xmm4
557	jmp	NEAR $L$oop_ctr32
558
559
;-----------------------------------------------------------------------
; aesni_gcm_encrypt -- exported entry point, Microsoft x64 convention.
; The six arguments arrive in rcx, rdx, r8, r9, [rsp+40], [rsp+48] and
; are moved into rdi, rsi, rdx, rcx, r8, r9 before the body runs:
;   rdi = input, rsi = output, rdx = length in bytes,
;   rcx = key schedule, r8 = 16-byte counter block,
;   r9  = state whose first 16 bytes are the hash value (read before the
;         main loop, written back at the end); tables are read at higher
;         offsets from it.
; Returns in rax the byte count accumulated in r10: 0 when the length is
; below 0x60*3 (288), otherwise 0x60*2 for the two up-front batches plus
; 0x60 for every pass of _aesni_ctr32_ghash_6x.
; Structure: two calls of _aesni_ctr32_6x encrypt the first twelve blocks
; so the hash side always has output to work on; the combined helper then
; runs the main loop; the long tail after it hashes the last twelve
; output blocks, which the loop has not consumed yet.
; rbx, rbp, r12-r15 and xmm6-xmm15 are saved below the entry rsp (kept in
; rax); gcm_se_handler relies on the same offsets.
; NOTE(review): the C prototype is not visible here -- confirm argument
; names and types against the caller.
;-----------------------------------------------------------------------
560global	aesni_gcm_encrypt
561
562ALIGN	32
563aesni_gcm_encrypt:
564	mov	QWORD[8+rsp],rdi	;WIN64 prologue
565	mov	QWORD[16+rsp],rsi
	; rax = rsp at entry; it serves as the frame base until the epilogue
566	mov	rax,rsp
567$L$SEH_begin_aesni_gcm_encrypt:
	; shuffle the Win64 argument registers into the layout the body expects
568	mov	rdi,rcx
569	mov	rsi,rdx
570	mov	rdx,r8
571	mov	rcx,r9
	; 5th and 6th arguments come from the stack
572	mov	r8,QWORD[40+rsp]
573	mov	r9,QWORD[48+rsp]
574
575
	; r10 = return value (bytes processed), 0 so far
576	xor	r10,r10
577
578
579
580
	; at least 0x60*3 = 288 bytes are required, otherwise return 0
581	cmp	rdx,0x60*3
582	jb	NEAR $L$gcm_enc_abort
583
584	lea	rax,[rsp]
	; callee-saved GPRs land at [rax-8] .. [rax-48]
585	push	rbx
586	push	rbp
587	push	r12
588	push	r13
589	push	r14
590	push	r15
	; room for xmm6..xmm15 at [rax-216] .. [rax-72]
591	lea	rsp,[((-168))+rsp]
592	movaps	XMMWORD[(-216)+rax],xmm6
593	movaps	XMMWORD[(-200)+rax],xmm7
594	movaps	XMMWORD[(-184)+rax],xmm8
595	movaps	XMMWORD[(-168)+rax],xmm9
596	movaps	XMMWORD[(-152)+rax],xmm10
597	movaps	XMMWORD[(-136)+rax],xmm11
598	movaps	XMMWORD[(-120)+rax],xmm12
599	movaps	XMMWORD[(-104)+rax],xmm13
600	movaps	XMMWORD[(-88)+rax],xmm14
601	movaps	XMMWORD[(-72)+rax],xmm15
602$L$gcm_enc_body:
603	vzeroupper
604
	; xmm1 = counter block, ebx = its last dword
605	vmovdqu	xmm1,XMMWORD[r8]
	; carve out a 128-byte aligned scratch area (add/and rsp below)
606	add	rsp,-128
607	mov	ebx,DWORD[12+r8]
	; r11 -> constant table ($L$bswap_mask and the constants after it)
608	lea	r11,[$L$bswap_mask]
	; r14 = key - 128 and r15 = 0xf80 feed the aliasing check below
609	lea	r14,[((-128))+rcx]
610	mov	r15,0xf80
	; bias the key pointer the way the helpers expect
611	lea	rcx,[128+rcx]
	; xmm0 = byte-swap shuffle mask
612	vmovdqu	xmm0,XMMWORD[r11]
613	and	rsp,-128
	; ebp = dword at key+240 (rcx already carries the +128 bias)
614	mov	ebp,DWORD[((240-128))+rcx]
615
	; Compare address bits 7..11 (mask 0xf80) of the scratch area and of
	; key-128: when 0 <= difference < 768 the stack is lowered by that
	; difference.  NOTE(review): per the label name this is meant to avoid
	; the stack frame aliasing the key schedule -- confirm the rationale.
616	and	r14,r15
617	and	r15,rsp
618	sub	r15,r14
619	jc	NEAR $L$enc_no_key_aliasing
620	cmp	r15,768
621	jnc	NEAR $L$enc_no_key_aliasing
622	sub	rsp,r15
623$L$enc_no_key_aliasing:
624
	; r14 = cursor for the helper's movbe loads: here it walks the OUTPUT
	; buffer, i.e. the data being hashed is what this function wrote
625	lea	r14,[rsi]
626
627
628
629
630
631
632
633
	; r15 = output + length - 192, the bound r14 is compared against
634	lea	r15,[((-192))+rdx*1+rsi]
635
	; length in bytes -> length in 16-byte blocks
636	shr	rdx,4
637
	; first batch of six blocks
638	call	_aesni_ctr32_6x
	; byte-swap the six fresh output blocks: the last one stays in xmm7,
	; the others go to stack slots 112..48 for the combined helper
639	vpshufb	xmm8,xmm9,xmm0
640	vpshufb	xmm2,xmm10,xmm0
641	vmovdqu	XMMWORD[112+rsp],xmm8
642	vpshufb	xmm4,xmm11,xmm0
643	vmovdqu	XMMWORD[96+rsp],xmm2
644	vpshufb	xmm5,xmm12,xmm0
645	vmovdqu	XMMWORD[80+rsp],xmm4
646	vpshufb	xmm6,xmm13,xmm0
647	vmovdqu	XMMWORD[64+rsp],xmm5
648	vpshufb	xmm7,xmm14,xmm0
649	vmovdqu	XMMWORD[48+rsp],xmm6
650
	; second batch of six blocks; the combined helper later picks its
	; output up from memory through r14
651	call	_aesni_ctr32_6x
652
	; xmm8 = current hash value, byte-swapped; r9 biased by +64 for the helper
653	vmovdqu	xmm8,XMMWORD[r9]
654	lea	r9,[((32+32))+r9]
	; twelve blocks are already encrypted
655	sub	rdx,12
656	mov	r10,0x60*2
657	vpshufb	xmm8,xmm8,xmm0
658
659	call	_aesni_ctr32_ghash_6x
	; Tail.  Two groups of six output blocks are still unhashed: one sits
	; byte-reversed in stack slots 32..112 (written by the helper's movbe
	; sequence), the other is the helper's final output in xmm9..xmm14,
	; which is stored to [rsi-96..] and byte-swapped in the lines below.
660	vmovdqu	xmm7,XMMWORD[32+rsp]
661	vmovdqu	xmm0,XMMWORD[r11]
662	vmovdqu	xmm3,XMMWORD[((0-32))+r9]
	; vpunpckhqdq + vpxor put (high qword xor low qword) of a block in both
	; halves; it is the operand of that block's middle multiply
663	vpunpckhqdq	xmm1,xmm7,xmm7
664	vmovdqu	xmm15,XMMWORD[((32-32))+r9]
665	vmovups	XMMWORD[(-96)+rsi],xmm9
666	vpshufb	xmm9,xmm9,xmm0
667	vpxor	xmm1,xmm1,xmm7
668	vmovups	XMMWORD[(-80)+rsi],xmm10
669	vpshufb	xmm10,xmm10,xmm0
670	vmovups	XMMWORD[(-64)+rsi],xmm11
671	vpshufb	xmm11,xmm11,xmm0
672	vmovups	XMMWORD[(-48)+rsi],xmm12
673	vpshufb	xmm12,xmm12,xmm0
674	vmovups	XMMWORD[(-32)+rsi],xmm13
675	vpshufb	xmm13,xmm13,xmm0
676	vmovups	XMMWORD[(-16)+rsi],xmm14
677	vpshufb	xmm14,xmm14,xmm0
	; park the first block of the last group; it is xored in further down
678	vmovdqu	XMMWORD[16+rsp],xmm9
	; ---- first group: blocks from slots 32..112 times table entries ----
679	vmovdqu	xmm6,XMMWORD[48+rsp]
680	vmovdqu	xmm0,XMMWORD[((16-32))+r9]
681	vpunpckhqdq	xmm2,xmm6,xmm6
682	vpclmulqdq	xmm5,xmm7,xmm3,0x00
683	vpxor	xmm2,xmm2,xmm6
684	vpclmulqdq	xmm7,xmm7,xmm3,0x11
685	vpclmulqdq	xmm1,xmm1,xmm15,0x00
686
687	vmovdqu	xmm9,XMMWORD[64+rsp]
688	vpclmulqdq	xmm4,xmm6,xmm0,0x00
689	vmovdqu	xmm3,XMMWORD[((48-32))+r9]
690	vpxor	xmm4,xmm4,xmm5
691	vpunpckhqdq	xmm5,xmm9,xmm9
692	vpclmulqdq	xmm6,xmm6,xmm0,0x11
693	vpxor	xmm5,xmm5,xmm9
694	vpxor	xmm6,xmm6,xmm7
695	vpclmulqdq	xmm2,xmm2,xmm15,0x10
696	vmovdqu	xmm15,XMMWORD[((80-32))+r9]
697	vpxor	xmm2,xmm2,xmm1
698
699	vmovdqu	xmm1,XMMWORD[80+rsp]
700	vpclmulqdq	xmm7,xmm9,xmm3,0x00
701	vmovdqu	xmm0,XMMWORD[((64-32))+r9]
702	vpxor	xmm7,xmm7,xmm4
703	vpunpckhqdq	xmm4,xmm1,xmm1
704	vpclmulqdq	xmm9,xmm9,xmm3,0x11
705	vpxor	xmm4,xmm4,xmm1
706	vpxor	xmm9,xmm9,xmm6
707	vpclmulqdq	xmm5,xmm5,xmm15,0x00
708	vpxor	xmm5,xmm5,xmm2
709
710	vmovdqu	xmm2,XMMWORD[96+rsp]
711	vpclmulqdq	xmm6,xmm1,xmm0,0x00
712	vmovdqu	xmm3,XMMWORD[((96-32))+r9]
713	vpxor	xmm6,xmm6,xmm7
714	vpunpckhqdq	xmm7,xmm2,xmm2
715	vpclmulqdq	xmm1,xmm1,xmm0,0x11
716	vpxor	xmm7,xmm7,xmm2
717	vpxor	xmm1,xmm1,xmm9
718	vpclmulqdq	xmm4,xmm4,xmm15,0x10
719	vmovdqu	xmm15,XMMWORD[((128-32))+r9]
720	vpxor	xmm4,xmm4,xmm5
721
	; the running hash value is xored into the block from slot 112
722	vpxor	xmm8,xmm8,XMMWORD[112+rsp]
723	vpclmulqdq	xmm5,xmm2,xmm3,0x00
724	vmovdqu	xmm0,XMMWORD[((112-32))+r9]
725	vpunpckhqdq	xmm9,xmm8,xmm8
726	vpxor	xmm5,xmm5,xmm6
727	vpclmulqdq	xmm2,xmm2,xmm3,0x11
728	vpxor	xmm9,xmm9,xmm8
729	vpxor	xmm2,xmm2,xmm1
730	vpclmulqdq	xmm7,xmm7,xmm15,0x00
731	vpxor	xmm4,xmm7,xmm4
732
	; last multiply of the first group; from here on the multiplies of
	; the second group (xmm14 down to xmm10) are interleaved with the
	; recombination and reduction of the first group's result
733	vpclmulqdq	xmm6,xmm8,xmm0,0x00
734	vmovdqu	xmm3,XMMWORD[((0-32))+r9]
735	vpunpckhqdq	xmm1,xmm14,xmm14
736	vpclmulqdq	xmm8,xmm8,xmm0,0x11
737	vpxor	xmm1,xmm1,xmm14
738	vpxor	xmm5,xmm6,xmm5
739	vpclmulqdq	xmm9,xmm9,xmm15,0x10
740	vmovdqu	xmm15,XMMWORD[((32-32))+r9]
741	vpxor	xmm7,xmm8,xmm2
742	vpxor	xmm6,xmm9,xmm4
743
744	vmovdqu	xmm0,XMMWORD[((16-32))+r9]
	; middle term ^= low sum ^ high sum, then split across the two halves
745	vpxor	xmm9,xmm7,xmm5
746	vpclmulqdq	xmm4,xmm14,xmm3,0x00
747	vpxor	xmm6,xmm6,xmm9
748	vpunpckhqdq	xmm2,xmm13,xmm13
749	vpclmulqdq	xmm14,xmm14,xmm3,0x11
750	vpxor	xmm2,xmm2,xmm13
751	vpslldq	xmm9,xmm6,8
752	vpclmulqdq	xmm1,xmm1,xmm15,0x00
	; xmm8 = low 128 bits, xmm7 = high 128 bits of the first group's product
753	vpxor	xmm8,xmm5,xmm9
754	vpsrldq	xmm6,xmm6,8
755	vpxor	xmm7,xmm7,xmm6
756
757	vpclmulqdq	xmm5,xmm13,xmm0,0x00
758	vmovdqu	xmm3,XMMWORD[((48-32))+r9]
759	vpxor	xmm5,xmm5,xmm4
760	vpunpckhqdq	xmm9,xmm12,xmm12
761	vpclmulqdq	xmm13,xmm13,xmm0,0x11
762	vpxor	xmm9,xmm9,xmm12
763	vpxor	xmm13,xmm13,xmm14
	; first reduction step: qword swap of xmm8, then clmul with $L$poly below
764	vpalignr	xmm14,xmm8,xmm8,8
765	vpclmulqdq	xmm2,xmm2,xmm15,0x10
766	vmovdqu	xmm15,XMMWORD[((80-32))+r9]
767	vpxor	xmm2,xmm2,xmm1
768
769	vpclmulqdq	xmm4,xmm12,xmm3,0x00
770	vmovdqu	xmm0,XMMWORD[((64-32))+r9]
771	vpxor	xmm4,xmm4,xmm5
772	vpunpckhqdq	xmm1,xmm11,xmm11
773	vpclmulqdq	xmm12,xmm12,xmm3,0x11
774	vpxor	xmm1,xmm1,xmm11
775	vpxor	xmm12,xmm12,xmm13
	; xor in the block parked at [16+rsp] (first block of the last group)
776	vxorps	xmm7,xmm7,XMMWORD[16+rsp]
777	vpclmulqdq	xmm9,xmm9,xmm15,0x00
778	vpxor	xmm9,xmm9,xmm2
779
	; [16+r11] is $L$poly
780	vpclmulqdq	xmm8,xmm8,XMMWORD[16+r11],0x10
781	vxorps	xmm8,xmm8,xmm14
782
783	vpclmulqdq	xmm5,xmm11,xmm0,0x00
784	vmovdqu	xmm3,XMMWORD[((96-32))+r9]
785	vpxor	xmm5,xmm5,xmm4
786	vpunpckhqdq	xmm2,xmm10,xmm10
787	vpclmulqdq	xmm11,xmm11,xmm0,0x11
788	vpxor	xmm2,xmm2,xmm10
	; second reduction step, same shape as the first
789	vpalignr	xmm14,xmm8,xmm8,8
790	vpxor	xmm11,xmm11,xmm12
791	vpclmulqdq	xmm1,xmm1,xmm15,0x10
792	vmovdqu	xmm15,XMMWORD[((128-32))+r9]
793	vpxor	xmm1,xmm1,xmm9
794
795	vxorps	xmm14,xmm14,xmm7
796	vpclmulqdq	xmm8,xmm8,XMMWORD[16+r11],0x10
	; xmm8 = reduced hash after the first group (with the parked block xored in)
797	vxorps	xmm8,xmm8,xmm14
798
799	vpclmulqdq	xmm4,xmm10,xmm3,0x00
800	vmovdqu	xmm0,XMMWORD[((112-32))+r9]
801	vpxor	xmm4,xmm4,xmm5
802	vpunpckhqdq	xmm9,xmm8,xmm8
803	vpclmulqdq	xmm10,xmm10,xmm3,0x11
804	vpxor	xmm9,xmm9,xmm8
805	vpxor	xmm10,xmm10,xmm11
806	vpclmulqdq	xmm2,xmm2,xmm15,0x00
807	vpxor	xmm2,xmm2,xmm1
808
	; final multiply of the second group uses xmm8 itself
809	vpclmulqdq	xmm5,xmm8,xmm0,0x00
810	vpclmulqdq	xmm7,xmm8,xmm0,0x11
811	vpxor	xmm5,xmm5,xmm4
812	vpclmulqdq	xmm6,xmm9,xmm15,0x10
813	vpxor	xmm7,xmm7,xmm10
814	vpxor	xmm6,xmm6,xmm2
815
	; recombine: middle ^= low ^ high, then split into xmm8 (low) / xmm7 (high)
816	vpxor	xmm4,xmm7,xmm5
817	vpxor	xmm6,xmm6,xmm4
818	vpslldq	xmm1,xmm6,8
	; xmm3 = $L$poly
819	vmovdqu	xmm3,XMMWORD[16+r11]
820	vpsrldq	xmm6,xmm6,8
821	vpxor	xmm8,xmm5,xmm1
822	vpxor	xmm7,xmm7,xmm6
823
	; two reduction steps, then fold in the high half
824	vpalignr	xmm2,xmm8,xmm8,8
825	vpclmulqdq	xmm8,xmm8,xmm3,0x10
826	vpxor	xmm8,xmm8,xmm2
827
828	vpalignr	xmm2,xmm8,xmm8,8
829	vpclmulqdq	xmm8,xmm8,xmm3,0x10
830	vpxor	xmm2,xmm2,xmm7
831	vpxor	xmm8,xmm8,xmm2
	; swap the hash value back and store it where it was loaded from
	; (r9 was advanced by 64 above)
832	vpshufb	xmm8,xmm8,XMMWORD[r11]
833	vmovdqu	XMMWORD[(-64)+r9],xmm8
834
835	vzeroupper
	; restore xmm6..xmm15 and the callee-saved GPRs from the frame at rax
836	movaps	xmm6,XMMWORD[((-216))+rax]
837	movaps	xmm7,XMMWORD[((-200))+rax]
838	movaps	xmm8,XMMWORD[((-184))+rax]
839	movaps	xmm9,XMMWORD[((-168))+rax]
840	movaps	xmm10,XMMWORD[((-152))+rax]
841	movaps	xmm11,XMMWORD[((-136))+rax]
842	movaps	xmm12,XMMWORD[((-120))+rax]
843	movaps	xmm13,XMMWORD[((-104))+rax]
844	movaps	xmm14,XMMWORD[((-88))+rax]
845	movaps	xmm15,XMMWORD[((-72))+rax]
846	mov	r15,QWORD[((-48))+rax]
847	mov	r14,QWORD[((-40))+rax]
848	mov	r13,QWORD[((-32))+rax]
849	mov	r12,QWORD[((-24))+rax]
850	mov	rbp,QWORD[((-16))+rax]
851	mov	rbx,QWORD[((-8))+rax]
	; back to the entry stack pointer
852	lea	rsp,[rax]
853$L$gcm_enc_abort:
	; return the processed byte count
854	mov	rax,r10
855	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
856	mov	rsi,QWORD[16+rsp]
857	DB	0F3h,0C3h		;repret
858$L$SEH_end_aesni_gcm_encrypt:
859ALIGN	64
; Constant table.  The code keeps r11 pointing at $L$bswap_mask and
; reaches the other entries by fixed offsets, so the order and the
; 16-byte size of each entry must not change:
;   [r11]    $L$bswap_mask   [16+r11] $L$poly      [32+r11] $L$one_msb
;   [48+r11] $L$two_lsb      [64+r11] $L$one_lsb
; vpshufb mask that reverses the 16 bytes of a register
860$L$bswap_mask:
861DB	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
; constant operand of the vpclmulqdq reduction steps
862$L$poly:
863DB	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
; 1 in byte 15: vpaddb with this steps the last byte of a counter block
864$L$one_msb:
865DB	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
; 2 and 1 in byte 0: used with vpaddd on a byte-swapped counter
866$L$two_lsb:
867DB	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
868$L$one_lsb:
869DB	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
; NUL-terminated ASCII string:
; "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro@openssl.org>"
870DB	65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108
871DB	101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82
872DB	89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
873DB	114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
874ALIGN	64
;-----------------------------------------------------------------------
; gcm_se_handler -- exception handler referenced from the .xdata records
; of aesni_gcm_decrypt and aesni_gcm_encrypt.
; It reads two structures through r8 and r9 at hard-coded offsets.
; NOTE(review): the offsets used below match the Win64 CONTEXT (r8) and
; DISPATCHER_CONTEXT (r9) layouts, e.g. CONTEXT.Rax at 120, .Rsp at 152,
; .Rip at 248, .Xmm6 at 512 -- confirm against winnt.h.
; Behaviour, based on where the faulting Rip lies relative to the two
; label offsets stored as handler data ($L$gcm_*_body, $L$gcm_*_abort):
;   Rip <  body  : frame base = context Rax (the functions copy rsp into
;                  rax at entry); nothing has been saved yet
;   Rip >= abort : frame base = context Rsp; everything is restored already
;   otherwise    : frame base = context Rax; the saved rbx, rbp, r12-r15
;                  and xmm6-xmm15 are copied from the frame into the context
; In all cases Rsp/Rsi/Rdi in the context are then fixed up, the context
; is copied to the dispatcher's context record, RtlVirtualUnwind is
; called, and 1 is returned in eax.
;-----------------------------------------------------------------------
875EXTERN	__imp_RtlVirtualUnwind
876
877ALIGN	16
878gcm_se_handler:
879	push	rsi
880	push	rdi
881	push	rbx
882	push	rbp
883	push	r12
884	push	r13
885	push	r14
886	push	r15
887	pushfq
	; 64 bytes: room for the outgoing arguments of the call below
888	sub	rsp,64
889
	; rax = context value at 120 (Rax), rbx = context value at 248 (Rip)
890	mov	rax,QWORD[120+r8]
891	mov	rbx,QWORD[248+r8]
892
	; rsi = dispatcher value at 8 (image base), r11 = value at 56 (handler data)
893	mov	rsi,QWORD[8+r9]
894	mov	r11,QWORD[56+r9]
895
	; first handler-data dword is the body label offset; before it nothing
	; has been pushed, so go straight to the common tail
896	mov	r10d,DWORD[r11]
897	lea	r10,[r10*1+rsi]
898	cmp	rbx,r10
899	jb	NEAR $L$common_seh_tail
900
	; rax = context value at 152 (Rsp)
901	mov	rax,QWORD[152+r8]
902
	; second handler-data dword is the abort label offset; at or past it
	; the function has already restored its registers
903	mov	r10d,DWORD[4+r11]
904	lea	r10,[r10*1+rsi]
905	cmp	rbx,r10
906	jae	NEAR $L$common_seh_tail
907
	; inside the body: rax (context Rax) is the frame base again
908	mov	rax,QWORD[120+r8]
909
	; pull the saved GPRs from the same slots the functions pushed them to
910	mov	r15,QWORD[((-48))+rax]
911	mov	r14,QWORD[((-40))+rax]
912	mov	r13,QWORD[((-32))+rax]
913	mov	r12,QWORD[((-24))+rax]
914	mov	rbp,QWORD[((-16))+rax]
915	mov	rbx,QWORD[((-8))+rax]
	; ... and write them into the context (R15, R14, R13, R12, Rbp, Rbx)
916	mov	QWORD[240+r8],r15
917	mov	QWORD[232+r8],r14
918	mov	QWORD[224+r8],r13
919	mov	QWORD[216+r8],r12
920	mov	QWORD[160+r8],rbp
921	mov	QWORD[144+r8],rbx
922
	; copy 20 qwords (the ten saved xmm registers at [rax-216]) into the
	; context at offset 512
923	lea	rsi,[((-216))+rax]
924	lea	rdi,[512+r8]
925	mov	ecx,20
	; the dword encodes the bytes FC F3 48 A5 = cld ; rep movsq
926	DD	0xa548f3fc
927
928$L$common_seh_tail:
	; rdi/rsi were parked at [8+frame] and [16+frame] by the WIN64 prologue
929	mov	rdi,QWORD[8+rax]
930	mov	rsi,QWORD[16+rax]
	; update Rsp, Rsi, Rdi in the context
931	mov	QWORD[152+r8],rax
932	mov	QWORD[168+r8],rsi
933	mov	QWORD[176+r8],rdi
934
	; copy the whole context (154 qwords) to the record at [40+r9]
935	mov	rdi,QWORD[40+r9]
936	mov	rsi,r8
937	mov	ecx,154
	; cld ; rep movsq (same encoding as above)
938	DD	0xa548f3fc
939
	; Build the eight arguments for RtlVirtualUnwind from the dispatcher
	; record: rcx = 0, rdx = [8], r8 = [0], r9 = [16], then on the stack
	; [40], the address of field 56, the address of field 24, and 0.
940	mov	rsi,r9
941	xor	rcx,rcx
942	mov	rdx,QWORD[8+rsi]
943	mov	r8,QWORD[rsi]
944	mov	r9,QWORD[16+rsi]
945	mov	r10,QWORD[40+rsi]
946	lea	r11,[56+rsi]
947	lea	r12,[24+rsi]
948	mov	QWORD[32+rsp],r10
949	mov	QWORD[40+rsp],r11
950	mov	QWORD[48+rsp],r12
951	mov	QWORD[56+rsp],rcx
952	call	QWORD[__imp_RtlVirtualUnwind]
953
	; return 1 (NOTE(review): presumably ExceptionContinueSearch -- confirm)
954	mov	eax,1
955	add	rsp,64
956	popfq
957	pop	r15
958	pop	r14
959	pop	r13
960	pop	r12
961	pop	rbp
962	pop	rbx
963	pop	rdi
964	pop	rsi
965	DB	0F3h,0C3h		;repret
966
967
; Exception-handling tables.  Each .pdata record holds three
; image-relative addresses: start label, end label and the matching
; .xdata record of one exported function.
968section	.pdata rdata align=4
969ALIGN	4
970	DD	$L$SEH_begin_aesni_gcm_decrypt wrt ..imagebase
971	DD	$L$SEH_end_aesni_gcm_decrypt wrt ..imagebase
972	DD	$L$SEH_gcm_dec_info wrt ..imagebase
973
974	DD	$L$SEH_begin_aesni_gcm_encrypt wrt ..imagebase
975	DD	$L$SEH_end_aesni_gcm_encrypt wrt ..imagebase
976	DD	$L$SEH_gcm_enc_info wrt ..imagebase
; Each .xdata record: a 4-byte header, the handler address, then two
; label addresses that gcm_se_handler reads as its handler data
; (first = start of the body, second = the abort/epilogue label).
; NOTE(review): header byte 9 presumably encodes unwind-info version 1
; with the exception and unwind handler flags set -- confirm.
977section	.xdata rdata align=8
978ALIGN	8
979$L$SEH_gcm_dec_info:
980DB	9,0,0,0
981	DD	gcm_se_handler wrt ..imagebase
982	DD	$L$gcm_dec_body wrt ..imagebase,$L$gcm_dec_abort wrt ..imagebase
983$L$SEH_gcm_enc_info:
984DB	9,0,0,0
985	DD	gcm_se_handler wrt ..imagebase
986	DD	$L$gcm_enc_body wrt ..imagebase,$L$gcm_enc_abort wrt ..imagebase
987
987