• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
; NASM source for x86-64.  With "default rel", memory operands that name only
; a symbol (e.g. [OPENSSL_ia32cap_P+8], [$L$mask52x4]) are RIP-relative.
1default	rel
; The size keywords are defined as empty, so "YMMWORD[addr]" expands to plain
; "[addr]"; the operand size then comes from the register operand.
2%define XMMWORD
3%define YMMWORD
4%define ZMMWORD
; Capability array defined outside this file; only the dword at byte offset 8
; is read here (by ossl_rsaz_avx512ifma_eligible).
5EXTERN	OPENSSL_ia32cap_P
global	ossl_rsaz_avx512ifma_eligible

;-----------------------------------------------------------------------
; ossl_rsaz_avx512ifma_eligible
; In:    none (reads the dword at OPENSSL_ia32cap_P+8)
; Out:   eax = 0x80230000 (non-zero) when bits 31, 21, 17 and 16 of that
;        dword are all set, otherwise 0
; Clobb: ecx, flags
; NOTE(review): per OpenSSL's OPENSSL_ia32cap documentation these bits are
; presumably AVX512VL / AVX512IFMA / AVX512DQ / AVX512F -- confirm.
;-----------------------------------------------------------------------
ALIGN	32
ossl_rsaz_avx512ifma_eligible:
	xor	eax,eax				; default answer: not eligible
	mov	ecx,DWORD[((OPENSSL_ia32cap_P+8))]
	and	ecx,0x80230000			; keep only the four required bits
	cmp	ecx,0x80230000			; all four present?
	cmove	eax,ecx				; yes -> return the (non-zero) bit pattern
	DB	0F3h,0C3h		;repret
16
;-----------------------------------------------------------------------
; ossl_rsaz_amm52x20_x1_256 -- Win64 entry point with five arguments
; (rcx, rdx, r8, r9 and one stack argument read from [40+rsp]).
; After the argument shuffle in the prologue the roles are:
;   rdi = destination; 20 qwords (5 x 32 bytes) are stored at the end
;   rsi = first source operand, read as 20 qwords
;   rdx = second source operand (copied to r11), consumed one qword per step
;   rcx = third source operand, read as 20 qwords
;   r8  = scalar used to derive the per-step multiplier applied to [rcx]
; All digits are handled as 52-bit values (mask 2^52-1 in rax / $L$mask52x4).
; Callee-saved rbx, rbp, r12-r15 are pushed and restored; rdi/rsi are kept in
; the caller-provided slots at [8+rsp]/[16+rsp].
; NOTE(review): the name suggests an "almost Montgomery multiplication" on
; 20 x 52-bit digits (rsi * rdx reduced by rcx, r8 = k0) -- confirm with caller.
;-----------------------------------------------------------------------
17section	.text code align=64
18
19
20global	ossl_rsaz_amm52x20_x1_256
21
22ALIGN	32
23ossl_rsaz_amm52x20_x1_256:
24	mov	QWORD[8+rsp],rdi	;WIN64 prologue
25	mov	QWORD[16+rsp],rsi
; rax = entry rsp; rsaz_def_handler reads it while Rip is before the body label
26	mov	rax,rsp
27$L$SEH_begin_ossl_rsaz_amm52x20_x1_256:
; move the Win64 argument registers into rdi, rsi, rdx, rcx, r8
28	mov	rdi,rcx
29	mov	rsi,rdx
30	mov	rdx,r8
31	mov	rcx,r9
32	mov	r8,QWORD[40+rsp]
33
34
35
; bytes F3 0F 1E FA = endbr64
36DB	243,15,30,250
; save callee-saved registers (6 pushes = 48 bytes, undone in the epilogue)
37	push	rbx
38
39	push	rbp
40
41	push	r12
42
43	push	r13
44
45	push	r14
46
47	push	r15
48
49$L$rsaz_amm52x20_x1_256_body:
50
51
; ymm0 stays zero; ymm1,ymm16..ymm19 form the 20-lane accumulator, cleared here
52	vpxord	ymm0,ymm0,ymm0
53	vmovdqa64	ymm1,ymm0
54	vmovdqa64	ymm16,ymm0
55	vmovdqa64	ymm17,ymm0
56	vmovdqa64	ymm18,ymm0
57	vmovdqa64	ymm19,ymm0
58
; r9 = scalar accumulator for the lowest digit
59	xor	r9d,r9d
60
; r11 walks the second operand; rdx is needed as the implicit mulx source
61	mov	r11,rdx
; rax = 2^52-1
62	mov	rax,0xfffffffffffff
63
64
; 5 iterations, each unrolled into 4 steps = 20 digits of the second operand
65	mov	ebx,5
66
67ALIGN	32
68$L$loop5:
; ---- step 1 of 4: digit at [r11] ----
69	mov	r13,QWORD[r11]
70
; ymm3 = current digit in every lane
71	vpbroadcastq	ymm3,r13
72	mov	rdx,QWORD[rsi]
; r12:r13 = [rsi][0] * digit; add into r10:r9
73	mulx	r12,r13,r13
74	add	r9,r13
75	mov	r10,r12
76	adc	r10,0
77
; r13 = (r8 * r9) mod 2^52 : multiplier for the [rcx] operand
78	mov	r13,r8
79	imul	r13,r9
80	and	r13,rax
81
; ymm4 = that multiplier in every lane
82	vpbroadcastq	ymm4,r13
83	mov	rdx,QWORD[rcx]
; r10:r9 += [rcx][0] * multiplier
84	mulx	r12,r13,r13
85	add	r9,r13
86	adc	r10,r12
87
; r9 = (r10:r9) >> 52 : discard the low 52-bit digit, keep the carry part
88	shr	r9,52
89	sal	r10,12
90	or	r9,r10
91
; accumulate the low 52 bits of digit * [rsi][i]
92	vpmadd52luq	ymm1,ymm3,YMMWORD[rsi]
93	vpmadd52luq	ymm16,ymm3,YMMWORD[32+rsi]
94	vpmadd52luq	ymm17,ymm3,YMMWORD[64+rsi]
95	vpmadd52luq	ymm18,ymm3,YMMWORD[96+rsi]
96	vpmadd52luq	ymm19,ymm3,YMMWORD[128+rsi]
97
; accumulate the low 52 bits of multiplier * [rcx][i]
98	vpmadd52luq	ymm1,ymm4,YMMWORD[rcx]
99	vpmadd52luq	ymm16,ymm4,YMMWORD[32+rcx]
100	vpmadd52luq	ymm17,ymm4,YMMWORD[64+rcx]
101	vpmadd52luq	ymm18,ymm4,YMMWORD[96+rcx]
102	vpmadd52luq	ymm19,ymm4,YMMWORD[128+rcx]
103
104
; shift the 20-lane accumulator down by one qword lane; zero (ymm0) enters at the top
105	valignq	ymm1,ymm16,ymm1,1
106	valignq	ymm16,ymm17,ymm16,1
107	valignq	ymm17,ymm18,ymm17,1
108	valignq	ymm18,ymm19,ymm18,1
109	valignq	ymm19,ymm0,ymm19,1
110
; fold the new lane 0 into the scalar accumulator
111	vmovq	r13,xmm1
112	add	r9,r13
113
; accumulate the high 52 bits of the same products into the shifted accumulator
114	vpmadd52huq	ymm1,ymm3,YMMWORD[rsi]
115	vpmadd52huq	ymm16,ymm3,YMMWORD[32+rsi]
116	vpmadd52huq	ymm17,ymm3,YMMWORD[64+rsi]
117	vpmadd52huq	ymm18,ymm3,YMMWORD[96+rsi]
118	vpmadd52huq	ymm19,ymm3,YMMWORD[128+rsi]
119
120	vpmadd52huq	ymm1,ymm4,YMMWORD[rcx]
121	vpmadd52huq	ymm16,ymm4,YMMWORD[32+rcx]
122	vpmadd52huq	ymm17,ymm4,YMMWORD[64+rcx]
123	vpmadd52huq	ymm18,ymm4,YMMWORD[96+rcx]
124	vpmadd52huq	ymm19,ymm4,YMMWORD[128+rcx]
; ---- step 2 of 4: same sequence for the digit at [8+r11] ----
125	mov	r13,QWORD[8+r11]
126
127	vpbroadcastq	ymm3,r13
128	mov	rdx,QWORD[rsi]
129	mulx	r12,r13,r13
130	add	r9,r13
131	mov	r10,r12
132	adc	r10,0
133
134	mov	r13,r8
135	imul	r13,r9
136	and	r13,rax
137
138	vpbroadcastq	ymm4,r13
139	mov	rdx,QWORD[rcx]
140	mulx	r12,r13,r13
141	add	r9,r13
142	adc	r10,r12
143
144	shr	r9,52
145	sal	r10,12
146	or	r9,r10
147
148	vpmadd52luq	ymm1,ymm3,YMMWORD[rsi]
149	vpmadd52luq	ymm16,ymm3,YMMWORD[32+rsi]
150	vpmadd52luq	ymm17,ymm3,YMMWORD[64+rsi]
151	vpmadd52luq	ymm18,ymm3,YMMWORD[96+rsi]
152	vpmadd52luq	ymm19,ymm3,YMMWORD[128+rsi]
153
154	vpmadd52luq	ymm1,ymm4,YMMWORD[rcx]
155	vpmadd52luq	ymm16,ymm4,YMMWORD[32+rcx]
156	vpmadd52luq	ymm17,ymm4,YMMWORD[64+rcx]
157	vpmadd52luq	ymm18,ymm4,YMMWORD[96+rcx]
158	vpmadd52luq	ymm19,ymm4,YMMWORD[128+rcx]
159
160
161	valignq	ymm1,ymm16,ymm1,1
162	valignq	ymm16,ymm17,ymm16,1
163	valignq	ymm17,ymm18,ymm17,1
164	valignq	ymm18,ymm19,ymm18,1
165	valignq	ymm19,ymm0,ymm19,1
166
167	vmovq	r13,xmm1
168	add	r9,r13
169
170	vpmadd52huq	ymm1,ymm3,YMMWORD[rsi]
171	vpmadd52huq	ymm16,ymm3,YMMWORD[32+rsi]
172	vpmadd52huq	ymm17,ymm3,YMMWORD[64+rsi]
173	vpmadd52huq	ymm18,ymm3,YMMWORD[96+rsi]
174	vpmadd52huq	ymm19,ymm3,YMMWORD[128+rsi]
175
176	vpmadd52huq	ymm1,ymm4,YMMWORD[rcx]
177	vpmadd52huq	ymm16,ymm4,YMMWORD[32+rcx]
178	vpmadd52huq	ymm17,ymm4,YMMWORD[64+rcx]
179	vpmadd52huq	ymm18,ymm4,YMMWORD[96+rcx]
180	vpmadd52huq	ymm19,ymm4,YMMWORD[128+rcx]
; ---- step 3 of 4: same sequence for the digit at [16+r11] ----
181	mov	r13,QWORD[16+r11]
182
183	vpbroadcastq	ymm3,r13
184	mov	rdx,QWORD[rsi]
185	mulx	r12,r13,r13
186	add	r9,r13
187	mov	r10,r12
188	adc	r10,0
189
190	mov	r13,r8
191	imul	r13,r9
192	and	r13,rax
193
194	vpbroadcastq	ymm4,r13
195	mov	rdx,QWORD[rcx]
196	mulx	r12,r13,r13
197	add	r9,r13
198	adc	r10,r12
199
200	shr	r9,52
201	sal	r10,12
202	or	r9,r10
203
204	vpmadd52luq	ymm1,ymm3,YMMWORD[rsi]
205	vpmadd52luq	ymm16,ymm3,YMMWORD[32+rsi]
206	vpmadd52luq	ymm17,ymm3,YMMWORD[64+rsi]
207	vpmadd52luq	ymm18,ymm3,YMMWORD[96+rsi]
208	vpmadd52luq	ymm19,ymm3,YMMWORD[128+rsi]
209
210	vpmadd52luq	ymm1,ymm4,YMMWORD[rcx]
211	vpmadd52luq	ymm16,ymm4,YMMWORD[32+rcx]
212	vpmadd52luq	ymm17,ymm4,YMMWORD[64+rcx]
213	vpmadd52luq	ymm18,ymm4,YMMWORD[96+rcx]
214	vpmadd52luq	ymm19,ymm4,YMMWORD[128+rcx]
215
216
217	valignq	ymm1,ymm16,ymm1,1
218	valignq	ymm16,ymm17,ymm16,1
219	valignq	ymm17,ymm18,ymm17,1
220	valignq	ymm18,ymm19,ymm18,1
221	valignq	ymm19,ymm0,ymm19,1
222
223	vmovq	r13,xmm1
224	add	r9,r13
225
226	vpmadd52huq	ymm1,ymm3,YMMWORD[rsi]
227	vpmadd52huq	ymm16,ymm3,YMMWORD[32+rsi]
228	vpmadd52huq	ymm17,ymm3,YMMWORD[64+rsi]
229	vpmadd52huq	ymm18,ymm3,YMMWORD[96+rsi]
230	vpmadd52huq	ymm19,ymm3,YMMWORD[128+rsi]
231
232	vpmadd52huq	ymm1,ymm4,YMMWORD[rcx]
233	vpmadd52huq	ymm16,ymm4,YMMWORD[32+rcx]
234	vpmadd52huq	ymm17,ymm4,YMMWORD[64+rcx]
235	vpmadd52huq	ymm18,ymm4,YMMWORD[96+rcx]
236	vpmadd52huq	ymm19,ymm4,YMMWORD[128+rcx]
; ---- step 4 of 4: same sequence for the digit at [24+r11] ----
237	mov	r13,QWORD[24+r11]
238
239	vpbroadcastq	ymm3,r13
240	mov	rdx,QWORD[rsi]
241	mulx	r12,r13,r13
242	add	r9,r13
243	mov	r10,r12
244	adc	r10,0
245
246	mov	r13,r8
247	imul	r13,r9
248	and	r13,rax
249
250	vpbroadcastq	ymm4,r13
251	mov	rdx,QWORD[rcx]
252	mulx	r12,r13,r13
253	add	r9,r13
254	adc	r10,r12
255
256	shr	r9,52
257	sal	r10,12
258	or	r9,r10
259
260	vpmadd52luq	ymm1,ymm3,YMMWORD[rsi]
261	vpmadd52luq	ymm16,ymm3,YMMWORD[32+rsi]
262	vpmadd52luq	ymm17,ymm3,YMMWORD[64+rsi]
263	vpmadd52luq	ymm18,ymm3,YMMWORD[96+rsi]
264	vpmadd52luq	ymm19,ymm3,YMMWORD[128+rsi]
265
266	vpmadd52luq	ymm1,ymm4,YMMWORD[rcx]
267	vpmadd52luq	ymm16,ymm4,YMMWORD[32+rcx]
268	vpmadd52luq	ymm17,ymm4,YMMWORD[64+rcx]
269	vpmadd52luq	ymm18,ymm4,YMMWORD[96+rcx]
270	vpmadd52luq	ymm19,ymm4,YMMWORD[128+rcx]
271
272
273	valignq	ymm1,ymm16,ymm1,1
274	valignq	ymm16,ymm17,ymm16,1
275	valignq	ymm17,ymm18,ymm17,1
276	valignq	ymm18,ymm19,ymm18,1
277	valignq	ymm19,ymm0,ymm19,1
278
279	vmovq	r13,xmm1
280	add	r9,r13
281
282	vpmadd52huq	ymm1,ymm3,YMMWORD[rsi]
283	vpmadd52huq	ymm16,ymm3,YMMWORD[32+rsi]
284	vpmadd52huq	ymm17,ymm3,YMMWORD[64+rsi]
285	vpmadd52huq	ymm18,ymm3,YMMWORD[96+rsi]
286	vpmadd52huq	ymm19,ymm3,YMMWORD[128+rsi]
287
288	vpmadd52huq	ymm1,ymm4,YMMWORD[rcx]
289	vpmadd52huq	ymm16,ymm4,YMMWORD[32+rcx]
290	vpmadd52huq	ymm17,ymm4,YMMWORD[64+rcx]
291	vpmadd52huq	ymm18,ymm4,YMMWORD[96+rcx]
292	vpmadd52huq	ymm19,ymm4,YMMWORD[128+rcx]
; advance past the 4 digits just consumed
293	lea	r11,[32+r11]
294	dec	ebx
295	jne	NEAR $L$loop5
296
; ---- normalisation: bring every lane back to 52 bits ----
; ymm4 = 2^52-1 in every lane
297	vmovdqa64	ymm4,YMMWORD[$L$mask52x4]
298
; put the scalar accumulator r9 into lane 0 (imm 3 = low two dwords from ymm3)
299	vpbroadcastq	ymm3,r9
300	vpblendd	ymm1,ymm1,ymm3,3
301
302
303
; ymm24..ymm28 = per-lane carries (bits 52 and up)
304	vpsrlq	ymm24,ymm1,52
305	vpsrlq	ymm25,ymm16,52
306	vpsrlq	ymm26,ymm17,52
307	vpsrlq	ymm27,ymm18,52
308	vpsrlq	ymm28,ymm19,52
309
310
; move each carry up by one lane; lane 0 receives zero from ymm0
311	valignq	ymm28,ymm28,ymm27,3
312	valignq	ymm27,ymm27,ymm26,3
313	valignq	ymm26,ymm26,ymm25,3
314	valignq	ymm25,ymm25,ymm24,3
315	valignq	ymm24,ymm24,ymm0,3
316
317
; keep the low 52 bits of every lane ...
318	vpandq	ymm1,ymm1,ymm4
319	vpandq	ymm16,ymm16,ymm4
320	vpandq	ymm17,ymm17,ymm4
321	vpandq	ymm18,ymm18,ymm4
322	vpandq	ymm19,ymm19,ymm4
323
324
; ... and add the carry coming from the lane below
325	vpaddq	ymm1,ymm1,ymm24
326	vpaddq	ymm16,ymm16,ymm25
327	vpaddq	ymm17,ymm17,ymm26
328	vpaddq	ymm18,ymm18,ymm27
329	vpaddq	ymm19,ymm19,ymm28
330
331
332
; predicate 1 (unsigned less-than): bit set where mask < lane, i.e. lane overflowed 52 bits
333	vpcmpuq	k1,ymm4,ymm1,1
334	vpcmpuq	k2,ymm4,ymm16,1
335	vpcmpuq	k3,ymm4,ymm17,1
336	vpcmpuq	k4,ymm4,ymm18,1
337	vpcmpuq	k5,ymm4,ymm19,1
338	kmovb	r14d,k1
339	kmovb	r13d,k2
340	kmovb	r12d,k3
341	kmovb	r11d,k4
342	kmovb	r10d,k5
343
344
; predicate 0 (equal): bit set where lane == mask (all 52 bits set)
345	vpcmpuq	k1,ymm4,ymm1,0
346	vpcmpuq	k2,ymm4,ymm16,0
347	vpcmpuq	k3,ymm4,ymm17,0
348	vpcmpuq	k4,ymm4,ymm18,0
349	vpcmpuq	k5,ymm4,ymm19,0
350	kmovb	r9d,k1
351	kmovb	r8d,k2
352	kmovb	ebx,k3
353	kmovb	ecx,k4
354	kmovb	edx,k5
355
356
357
; pack the 4-bit "overflow" masks: r14b = lanes 0-7, r12b = lanes 8-15, r10b = lanes 16-19
358	shl	r13b,4
359	or	r14b,r13b
360	shl	r11b,4
361	or	r12b,r11b
362
; shift the 20-bit overflow bitmap left by one lane (add with carry chain)
363	add	r14b,r14b
364	adc	r12b,r12b
365	adc	r10b,r10b
366
; pack the "equal" masks the same way: r9b, bl, dl
367	shl	r8b,4
368	or	r9b,r8b
369	shl	cl,4
370	or	bl,cl
371
; adding the "equal" bitmap lets a carry ripple through lanes that are all ones
372	add	r14b,r9b
373	adc	r12b,bl
374	adc	r10b,dl
375
; xor with the "equal" bitmap: the result is used below as the per-lane correction mask
376	xor	r14b,r9b
377	xor	r12b,bl
378	xor	r10b,dl
379
; split the bitmap back into five 4-lane masks k1..k5
380	kmovb	k1,r14d
381	shr	r14b,4
382	kmovb	k2,r14d
383	kmovb	k3,r12d
384	shr	r12b,4
385	kmovb	k4,r12d
386	kmovb	k5,r10d
387
388
; selected lanes: lane - (2^52-1); together with the AND below this is (lane+1) mod 2^52
389	vpsubq	ymm1{k1},ymm1,ymm4
390	vpsubq	ymm16{k2},ymm16,ymm4
391	vpsubq	ymm17{k3},ymm17,ymm4
392	vpsubq	ymm18{k4},ymm18,ymm4
393	vpsubq	ymm19{k5},ymm19,ymm4
394
395	vpandq	ymm1,ymm1,ymm4
396	vpandq	ymm16,ymm16,ymm4
397	vpandq	ymm17,ymm17,ymm4
398	vpandq	ymm18,ymm18,ymm4
399	vpandq	ymm19,ymm19,ymm4
400
; store the 20 result digits
401	vmovdqu64	YMMWORD[rdi],ymm1
402	vmovdqu64	YMMWORD[32+rdi],ymm16
403	vmovdqu64	YMMWORD[64+rdi],ymm17
404	vmovdqu64	YMMWORD[96+rdi],ymm18
405	vmovdqu64	YMMWORD[128+rdi],ymm19
406
407	vzeroupper
; reload the callee-saved registers pushed in the prologue, then drop the 48 bytes
408	mov	r15,QWORD[rsp]
409
410	mov	r14,QWORD[8+rsp]
411
412	mov	r13,QWORD[16+rsp]
413
414	mov	r12,QWORD[24+rsp]
415
416	mov	rbp,QWORD[32+rsp]
417
418	mov	rbx,QWORD[40+rsp]
419
420	lea	rsp,[48+rsp]
421
422$L$rsaz_amm52x20_x1_256_epilogue:
423	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
424	mov	rsi,QWORD[16+rsp]
425	DB	0F3h,0C3h		;repret
426
427$L$SEH_end_ossl_rsaz_amm52x20_x1_256:
section	.data data align=8

; Four qword lanes, each 2^52-1 (low 52 bits set).  Loaded with an aligned
; 32-byte vmovdqa64 by the amm52x20 routines, hence the 32-byte alignment.
ALIGN	32
$L$mask52x4:
	TIMES 4 DQ	0x000fffffffffffff
;-----------------------------------------------------------------------
; ossl_rsaz_amm52x20_x2_256 -- Win64 entry point with five arguments
; (rcx, rdx, r8, r9 and one stack argument read from [40+rsp]).
; Runs two independent copies of the 20-digit, 52-bit multiply-accumulate
; sequence used by ossl_rsaz_amm52x20_x1_256, interleaved one digit at a time.
; After the argument shuffle in the prologue:
;   rdi = destination; 40 qwords stored (first result at +0, second at +160)
;   rsi = first source operands  (20 qwords at +0 and 20 qwords at +160)
;   rdx = second source operands (copied to r11; digits at +0 and +160)
;   rcx = third source operands  (20 qwords at +0 and 20 qwords at +160)
;   r8  = pointer to two scalars: [r8] for the first set, [8+r8] for the second
; Accumulators: ymm1,ymm16..19 + r9 (first set), ymm2,ymm20..23 + r15 (second).
; NOTE(review): presumably two "almost Montgomery multiplications" computed in
; one call (name: x2) -- confirm against the caller.
;-----------------------------------------------------------------------
436section	.text code align=64
437
438
439global	ossl_rsaz_amm52x20_x2_256
440
441ALIGN	32
442ossl_rsaz_amm52x20_x2_256:
443	mov	QWORD[8+rsp],rdi	;WIN64 prologue
444	mov	QWORD[16+rsp],rsi
; rax = entry rsp; rsaz_def_handler reads it while Rip is before the body label
445	mov	rax,rsp
446$L$SEH_begin_ossl_rsaz_amm52x20_x2_256:
; move the Win64 argument registers into rdi, rsi, rdx, rcx, r8
447	mov	rdi,rcx
448	mov	rsi,rdx
449	mov	rdx,r8
450	mov	rcx,r9
451	mov	r8,QWORD[40+rsp]
452
453
454
; bytes F3 0F 1E FA = endbr64
455DB	243,15,30,250
; save callee-saved registers (6 pushes = 48 bytes, undone in the epilogue)
456	push	rbx
457
458	push	rbp
459
460	push	r12
461
462	push	r13
463
464	push	r14
465
466	push	r15
467
468$L$rsaz_amm52x20_x2_256_body:
469
470
; ymm0 stays zero; clear both 20-lane accumulators
471	vpxord	ymm0,ymm0,ymm0
472	vmovdqa64	ymm1,ymm0
473	vmovdqa64	ymm16,ymm0
474	vmovdqa64	ymm17,ymm0
475	vmovdqa64	ymm18,ymm0
476	vmovdqa64	ymm19,ymm0
477	vmovdqa64	ymm2,ymm0
478	vmovdqa64	ymm20,ymm0
479	vmovdqa64	ymm21,ymm0
480	vmovdqa64	ymm22,ymm0
481	vmovdqa64	ymm23,ymm0
482
; scalar low-digit accumulators: r9 (first set), r15 (second set)
483	xor	r9d,r9d
484	xor	r15d,r15d
485
; r11 walks the second operand; rax = 2^52-1
486	mov	r11,rdx
487	mov	rax,0xfffffffffffff
488
; 20 iterations, one digit of each set per iteration
489	mov	ebx,20
490
491ALIGN	32
492$L$loop20:
; ---- first set: digit at [r11] ----
493	mov	r13,QWORD[r11]
494
495	vpbroadcastq	ymm3,r13
496	mov	rdx,QWORD[rsi]
; r10:r9 += [rsi][0] * digit
497	mulx	r12,r13,r13
498	add	r9,r13
499	mov	r10,r12
500	adc	r10,0
501
; r13 = ([r8] * r9) mod 2^52 : multiplier for the [rcx] operand
502	mov	r13,QWORD[r8]
503	imul	r13,r9
504	and	r13,rax
505
506	vpbroadcastq	ymm4,r13
507	mov	rdx,QWORD[rcx]
; r10:r9 += [rcx][0] * multiplier
508	mulx	r12,r13,r13
509	add	r9,r13
510	adc	r10,r12
511
; r9 = (r10:r9) >> 52
512	shr	r9,52
513	sal	r10,12
514	or	r9,r10
515
; accumulate low 52-bit product halves
516	vpmadd52luq	ymm1,ymm3,YMMWORD[rsi]
517	vpmadd52luq	ymm16,ymm3,YMMWORD[32+rsi]
518	vpmadd52luq	ymm17,ymm3,YMMWORD[64+rsi]
519	vpmadd52luq	ymm18,ymm3,YMMWORD[96+rsi]
520	vpmadd52luq	ymm19,ymm3,YMMWORD[128+rsi]
521
522	vpmadd52luq	ymm1,ymm4,YMMWORD[rcx]
523	vpmadd52luq	ymm16,ymm4,YMMWORD[32+rcx]
524	vpmadd52luq	ymm17,ymm4,YMMWORD[64+rcx]
525	vpmadd52luq	ymm18,ymm4,YMMWORD[96+rcx]
526	vpmadd52luq	ymm19,ymm4,YMMWORD[128+rcx]
527
528
; shift the accumulator down by one qword lane; zero enters at the top
529	valignq	ymm1,ymm16,ymm1,1
530	valignq	ymm16,ymm17,ymm16,1
531	valignq	ymm17,ymm18,ymm17,1
532	valignq	ymm18,ymm19,ymm18,1
533	valignq	ymm19,ymm0,ymm19,1
534
; fold the new lane 0 into the scalar accumulator
535	vmovq	r13,xmm1
536	add	r9,r13
537
; accumulate high 52-bit product halves into the shifted accumulator
538	vpmadd52huq	ymm1,ymm3,YMMWORD[rsi]
539	vpmadd52huq	ymm16,ymm3,YMMWORD[32+rsi]
540	vpmadd52huq	ymm17,ymm3,YMMWORD[64+rsi]
541	vpmadd52huq	ymm18,ymm3,YMMWORD[96+rsi]
542	vpmadd52huq	ymm19,ymm3,YMMWORD[128+rsi]
543
544	vpmadd52huq	ymm1,ymm4,YMMWORD[rcx]
545	vpmadd52huq	ymm16,ymm4,YMMWORD[32+rcx]
546	vpmadd52huq	ymm17,ymm4,YMMWORD[64+rcx]
547	vpmadd52huq	ymm18,ymm4,YMMWORD[96+rcx]
548	vpmadd52huq	ymm19,ymm4,YMMWORD[128+rcx]
; ---- second set: same sequence on the operands at byte offset 160, ----
; ---- using r15 as scalar accumulator and [8+r8] as the scalar       ----
549	mov	r13,QWORD[160+r11]
550
551	vpbroadcastq	ymm3,r13
552	mov	rdx,QWORD[160+rsi]
553	mulx	r12,r13,r13
554	add	r15,r13
555	mov	r10,r12
556	adc	r10,0
557
558	mov	r13,QWORD[8+r8]
559	imul	r13,r15
560	and	r13,rax
561
562	vpbroadcastq	ymm4,r13
563	mov	rdx,QWORD[160+rcx]
564	mulx	r12,r13,r13
565	add	r15,r13
566	adc	r10,r12
567
568	shr	r15,52
569	sal	r10,12
570	or	r15,r10
571
572	vpmadd52luq	ymm2,ymm3,YMMWORD[160+rsi]
573	vpmadd52luq	ymm20,ymm3,YMMWORD[192+rsi]
574	vpmadd52luq	ymm21,ymm3,YMMWORD[224+rsi]
575	vpmadd52luq	ymm22,ymm3,YMMWORD[256+rsi]
576	vpmadd52luq	ymm23,ymm3,YMMWORD[288+rsi]
577
578	vpmadd52luq	ymm2,ymm4,YMMWORD[160+rcx]
579	vpmadd52luq	ymm20,ymm4,YMMWORD[192+rcx]
580	vpmadd52luq	ymm21,ymm4,YMMWORD[224+rcx]
581	vpmadd52luq	ymm22,ymm4,YMMWORD[256+rcx]
582	vpmadd52luq	ymm23,ymm4,YMMWORD[288+rcx]
583
584
585	valignq	ymm2,ymm20,ymm2,1
586	valignq	ymm20,ymm21,ymm20,1
587	valignq	ymm21,ymm22,ymm21,1
588	valignq	ymm22,ymm23,ymm22,1
589	valignq	ymm23,ymm0,ymm23,1
590
591	vmovq	r13,xmm2
592	add	r15,r13
593
594	vpmadd52huq	ymm2,ymm3,YMMWORD[160+rsi]
595	vpmadd52huq	ymm20,ymm3,YMMWORD[192+rsi]
596	vpmadd52huq	ymm21,ymm3,YMMWORD[224+rsi]
597	vpmadd52huq	ymm22,ymm3,YMMWORD[256+rsi]
598	vpmadd52huq	ymm23,ymm3,YMMWORD[288+rsi]
599
600	vpmadd52huq	ymm2,ymm4,YMMWORD[160+rcx]
601	vpmadd52huq	ymm20,ymm4,YMMWORD[192+rcx]
602	vpmadd52huq	ymm21,ymm4,YMMWORD[224+rcx]
603	vpmadd52huq	ymm22,ymm4,YMMWORD[256+rcx]
604	vpmadd52huq	ymm23,ymm4,YMMWORD[288+rcx]
; next digit of both second operands
605	lea	r11,[8+r11]
606	dec	ebx
607	jne	NEAR $L$loop20
608
; ---- normalisation of the first result (ymm1, ymm16..ymm19) ----
; ymm4 = 2^52-1 in every lane
609	vmovdqa64	ymm4,YMMWORD[$L$mask52x4]
610
; put the scalar accumulator r9 into lane 0 (imm 3 = low two dwords from ymm3)
611	vpbroadcastq	ymm3,r9
612	vpblendd	ymm1,ymm1,ymm3,3
613
614
615
; per-lane carries (bits 52 and up)
616	vpsrlq	ymm24,ymm1,52
617	vpsrlq	ymm25,ymm16,52
618	vpsrlq	ymm26,ymm17,52
619	vpsrlq	ymm27,ymm18,52
620	vpsrlq	ymm28,ymm19,52
621
622
; move each carry up by one lane; lane 0 receives zero from ymm0
623	valignq	ymm28,ymm28,ymm27,3
624	valignq	ymm27,ymm27,ymm26,3
625	valignq	ymm26,ymm26,ymm25,3
626	valignq	ymm25,ymm25,ymm24,3
627	valignq	ymm24,ymm24,ymm0,3
628
629
; keep the low 52 bits, then add the carry from the lane below
630	vpandq	ymm1,ymm1,ymm4
631	vpandq	ymm16,ymm16,ymm4
632	vpandq	ymm17,ymm17,ymm4
633	vpandq	ymm18,ymm18,ymm4
634	vpandq	ymm19,ymm19,ymm4
635
636
637	vpaddq	ymm1,ymm1,ymm24
638	vpaddq	ymm16,ymm16,ymm25
639	vpaddq	ymm17,ymm17,ymm26
640	vpaddq	ymm18,ymm18,ymm27
641	vpaddq	ymm19,ymm19,ymm28
642
643
644
; predicate 1 (unsigned less-than): bit set where mask < lane (lane overflowed 52 bits)
645	vpcmpuq	k1,ymm4,ymm1,1
646	vpcmpuq	k2,ymm4,ymm16,1
647	vpcmpuq	k3,ymm4,ymm17,1
648	vpcmpuq	k4,ymm4,ymm18,1
649	vpcmpuq	k5,ymm4,ymm19,1
650	kmovb	r14d,k1
651	kmovb	r13d,k2
652	kmovb	r12d,k3
653	kmovb	r11d,k4
654	kmovb	r10d,k5
655
656
; predicate 0 (equal): bit set where lane == mask (all 52 bits set)
; (r9, r8, ebx, ecx, edx are free to reuse here: the loop is finished)
657	vpcmpuq	k1,ymm4,ymm1,0
658	vpcmpuq	k2,ymm4,ymm16,0
659	vpcmpuq	k3,ymm4,ymm17,0
660	vpcmpuq	k4,ymm4,ymm18,0
661	vpcmpuq	k5,ymm4,ymm19,0
662	kmovb	r9d,k1
663	kmovb	r8d,k2
664	kmovb	ebx,k3
665	kmovb	ecx,k4
666	kmovb	edx,k5
667
668
669
; pack the 4-bit "overflow" masks into r14b / r12b / r10b
670	shl	r13b,4
671	or	r14b,r13b
672	shl	r11b,4
673	or	r12b,r11b
674
; shift the overflow bitmap left by one lane
675	add	r14b,r14b
676	adc	r12b,r12b
677	adc	r10b,r10b
678
; pack the "equal" masks into r9b / bl / dl
679	shl	r8b,4
680	or	r9b,r8b
681	shl	cl,4
682	or	bl,cl
683
; adding the "equal" bitmap lets a carry ripple through all-ones lanes
684	add	r14b,r9b
685	adc	r12b,bl
686	adc	r10b,dl
687
; xor with the "equal" bitmap: result is the per-lane correction mask used below
688	xor	r14b,r9b
689	xor	r12b,bl
690	xor	r10b,dl
691
; split back into five 4-lane masks
692	kmovb	k1,r14d
693	shr	r14b,4
694	kmovb	k2,r14d
695	kmovb	k3,r12d
696	shr	r12b,4
697	kmovb	k4,r12d
698	kmovb	k5,r10d
699
700
; selected lanes: lane - (2^52-1); with the AND below this is (lane+1) mod 2^52
701	vpsubq	ymm1{k1},ymm1,ymm4
702	vpsubq	ymm16{k2},ymm16,ymm4
703	vpsubq	ymm17{k3},ymm17,ymm4
704	vpsubq	ymm18{k4},ymm18,ymm4
705	vpsubq	ymm19{k5},ymm19,ymm4
706
707	vpandq	ymm1,ymm1,ymm4
708	vpandq	ymm16,ymm16,ymm4
709	vpandq	ymm17,ymm17,ymm4
710	vpandq	ymm18,ymm18,ymm4
711	vpandq	ymm19,ymm19,ymm4
712
; ---- normalisation of the second result (ymm2, ymm20..ymm23), same steps ----
; put the scalar accumulator r15 into lane 0
713	vpbroadcastq	ymm3,r15
714	vpblendd	ymm2,ymm2,ymm3,3
715
716
717
718	vpsrlq	ymm24,ymm2,52
719	vpsrlq	ymm25,ymm20,52
720	vpsrlq	ymm26,ymm21,52
721	vpsrlq	ymm27,ymm22,52
722	vpsrlq	ymm28,ymm23,52
723
724
725	valignq	ymm28,ymm28,ymm27,3
726	valignq	ymm27,ymm27,ymm26,3
727	valignq	ymm26,ymm26,ymm25,3
728	valignq	ymm25,ymm25,ymm24,3
729	valignq	ymm24,ymm24,ymm0,3
730
731
732	vpandq	ymm2,ymm2,ymm4
733	vpandq	ymm20,ymm20,ymm4
734	vpandq	ymm21,ymm21,ymm4
735	vpandq	ymm22,ymm22,ymm4
736	vpandq	ymm23,ymm23,ymm4
737
738
739	vpaddq	ymm2,ymm2,ymm24
740	vpaddq	ymm20,ymm20,ymm25
741	vpaddq	ymm21,ymm21,ymm26
742	vpaddq	ymm22,ymm22,ymm27
743	vpaddq	ymm23,ymm23,ymm28
744
745
746
747	vpcmpuq	k1,ymm4,ymm2,1
748	vpcmpuq	k2,ymm4,ymm20,1
749	vpcmpuq	k3,ymm4,ymm21,1
750	vpcmpuq	k4,ymm4,ymm22,1
751	vpcmpuq	k5,ymm4,ymm23,1
752	kmovb	r14d,k1
753	kmovb	r13d,k2
754	kmovb	r12d,k3
755	kmovb	r11d,k4
756	kmovb	r10d,k5
757
758
759	vpcmpuq	k1,ymm4,ymm2,0
760	vpcmpuq	k2,ymm4,ymm20,0
761	vpcmpuq	k3,ymm4,ymm21,0
762	vpcmpuq	k4,ymm4,ymm22,0
763	vpcmpuq	k5,ymm4,ymm23,0
764	kmovb	r9d,k1
765	kmovb	r8d,k2
766	kmovb	ebx,k3
767	kmovb	ecx,k4
768	kmovb	edx,k5
769
770
771
772	shl	r13b,4
773	or	r14b,r13b
774	shl	r11b,4
775	or	r12b,r11b
776
777	add	r14b,r14b
778	adc	r12b,r12b
779	adc	r10b,r10b
780
781	shl	r8b,4
782	or	r9b,r8b
783	shl	cl,4
784	or	bl,cl
785
786	add	r14b,r9b
787	adc	r12b,bl
788	adc	r10b,dl
789
790	xor	r14b,r9b
791	xor	r12b,bl
792	xor	r10b,dl
793
794	kmovb	k1,r14d
795	shr	r14b,4
796	kmovb	k2,r14d
797	kmovb	k3,r12d
798	shr	r12b,4
799	kmovb	k4,r12d
800	kmovb	k5,r10d
801
802
803	vpsubq	ymm2{k1},ymm2,ymm4
804	vpsubq	ymm20{k2},ymm20,ymm4
805	vpsubq	ymm21{k3},ymm21,ymm4
806	vpsubq	ymm22{k4},ymm22,ymm4
807	vpsubq	ymm23{k5},ymm23,ymm4
808
809	vpandq	ymm2,ymm2,ymm4
810	vpandq	ymm20,ymm20,ymm4
811	vpandq	ymm21,ymm21,ymm4
812	vpandq	ymm22,ymm22,ymm4
813	vpandq	ymm23,ymm23,ymm4
814
; store the first result (20 qwords at +0) ...
815	vmovdqu64	YMMWORD[rdi],ymm1
816	vmovdqu64	YMMWORD[32+rdi],ymm16
817	vmovdqu64	YMMWORD[64+rdi],ymm17
818	vmovdqu64	YMMWORD[96+rdi],ymm18
819	vmovdqu64	YMMWORD[128+rdi],ymm19
820
; ... and the second result (20 qwords at +160)
821	vmovdqu64	YMMWORD[160+rdi],ymm2
822	vmovdqu64	YMMWORD[192+rdi],ymm20
823	vmovdqu64	YMMWORD[224+rdi],ymm21
824	vmovdqu64	YMMWORD[256+rdi],ymm22
825	vmovdqu64	YMMWORD[288+rdi],ymm23
826
827	vzeroupper
; reload the callee-saved registers pushed in the prologue, then drop the 48 bytes
828	mov	r15,QWORD[rsp]
829
830	mov	r14,QWORD[8+rsp]
831
832	mov	r13,QWORD[16+rsp]
833
834	mov	r12,QWORD[24+rsp]
835
836	mov	rbp,QWORD[32+rsp]
837
838	mov	rbx,QWORD[40+rsp]
839
840	lea	rsp,[48+rsp]
841
842$L$rsaz_amm52x20_x2_256_epilogue:
843	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
844	mov	rsi,QWORD[16+rsp]
845	DB	0F3h,0C3h		;repret
846
847$L$SEH_end_ossl_rsaz_amm52x20_x2_256:
section	.text code align=64


ALIGN	32
global	ossl_extract_multiplier_2x20_win5

;-----------------------------------------------------------------------
; ossl_extract_multiplier_2x20_win5 -- Win64 entry point, four arguments
; (rcx, rdx, r8, r9), moved into rdi, rsi, rdx, rcx below.
;   rdi = destination, 5 x 32 bytes are written
;   rsi = table base
;   rdx = index of the entry to extract (compared with a counter 0,1,2,...)
;   rcx = selects the 160-byte half: the scan starts at rsi + rcx*160
; The loop visits 32 entries (10240 / 320 bytes); every entry is loaded and
; the one whose counter equals rdx is kept through a masked blend, so the
; memory access pattern does not depend on rdx.
; Clobb: rax, rcx, rdx, k1, ymm0-ymm4, ymm16-ymm23, flags
;-----------------------------------------------------------------------
ossl_extract_multiplier_2x20_win5:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_ossl_extract_multiplier_2x20_win5:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9



DB	243,15,30,250				; endbr64

	; rsi += rcx * 160   (5 << 5 = 160 bytes per half-entry)
	lea	rax,[rcx*4+rcx]
	shl	rax,5
	add	rsi,rax

	; rax = end pointer: 32 entries of 320 bytes each
	lea	rax,[10240+rsi]

	; ymm22 = wanted index in every lane, ymm23 = increment, ymm21 = counter
	vpbroadcastq	ymm22,rdx
	vmovdqa64	ymm23,YMMWORD[$L$ones]

	; clear the five result vectors and the counter
	vpxor	xmm4,xmm4,xmm4
	vmovdqa64	ymm21,ymm4
	vmovdqa64	ymm0,ymm4
	vmovdqa64	ymm1,ymm4
	vmovdqa64	ymm2,ymm4
	vmovdqa64	ymm3,ymm4

ALIGN	32
$L$loop:
	; load the current 160-byte candidate
	vmovdqu64	ymm16,YMMWORD[rsi]
	vmovdqu64	ymm17,YMMWORD[32+rsi]
	vmovdqu64	ymm18,YMMWORD[64+rsi]
	vmovdqu64	ymm19,YMMWORD[96+rsi]
	vmovdqu64	ymm20,YMMWORD[128+rsi]

	; k1 = all lanes set when counter == wanted index; then counter += 1
	vpcmpq	k1,ymm22,ymm21,0
	vpaddq	ymm21,ymm21,ymm23

	; keep the candidate only where k1 is set
	vpblendmq	ymm0{k1},ymm0,ymm16
	vpblendmq	ymm1{k1},ymm1,ymm17
	vpblendmq	ymm2{k1},ymm2,ymm18
	vpblendmq	ymm3{k1},ymm3,ymm19
	vpblendmq	ymm4{k1},ymm4,ymm20

	; step to the next entry and loop until the end pointer is reached
	add	rsi,320
	cmp	rax,rsi
	jne	NEAR $L$loop

	vmovdqu64	YMMWORD[rdi],ymm0
	vmovdqu64	YMMWORD[32+rdi],ymm1
	vmovdqu64	YMMWORD[64+rdi],ymm2
	vmovdqu64	YMMWORD[96+rdi],ymm3
	vmovdqu64	YMMWORD[128+rdi],ymm4

	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	DB	0F3h,0C3h		;repret

$L$SEH_end_ossl_extract_multiplier_2x20_win5:
section	.data data align=8

; Four qword lanes of 1: the per-iteration counter increment used by
; ossl_extract_multiplier_2x20_win5 (loaded with an aligned 32-byte vmovdqa64).
ALIGN	32
$L$ones:
	TIMES 4 DQ	1
EXTERN	__imp_RtlVirtualUnwind

; The preceding definitions live in "section .data".  The handler below is
; executable code, so switch back to the code section first; otherwise it is
; assembled into a data section and cannot be executed when an exception is
; dispatched through it.
section	.text code align=64

;-----------------------------------------------------------------------
; rsaz_def_handler -- Win64 language-specific exception handler referenced
; from the .xdata records of this file.
; In (Win64 handler convention):
;   rcx = exception record, rdx = establisher frame,
;   r8  = CONTEXT *, r9 = DISPATCHER_CONTEXT *
; HandlerData (at [56+r9]) holds two image-relative offsets:
;   [0] start of the function body (after the register pushes)
;   [4] start of the epilogue      (after the registers were reloaded)
; Depending on where Rip lies, the handler recovers the caller's rsp,
; restores rbx/rbp/r12-r15 (body only) and rdi/rsi into the CONTEXT, copies
; the CONTEXT into the dispatcher's context record and calls
; RtlVirtualUnwind.
; Out:  eax = 1
; The CONTEXT / DISPATCHER_CONTEXT field names in the comments follow the
; Windows x64 structure layouts; the code itself only uses the offsets.
;-----------------------------------------------------------------------
ALIGN	16
rsaz_def_handler:
	push	rsi
	push	rdi
	push	rbx
	push	rbp
	push	r12
	push	r13
	push	r14
	push	r15
	pushfq
	sub	rsp,64				; 4 home slots + 4 stack arguments for the call below

	mov	rax,QWORD[120+r8]		; context->Rax (entry rsp while still in the prologue)
	mov	rbx,QWORD[248+r8]		; context->Rip

	mov	rsi,QWORD[8+r9]			; disp->ImageBase
	mov	r11,QWORD[56+r9]		; disp->HandlerData

	mov	r10d,DWORD[r11]			; HandlerData[0]: body label offset
	lea	r10,[r10*1+rsi]
	cmp	rbx,r10				; Rip before the body? nothing pushed yet
	jb	NEAR $L$common_seh_tail

	mov	rax,QWORD[152+r8]		; context->Rsp

	mov	r10d,DWORD[4+r11]		; HandlerData[1]: epilogue label offset
	lea	r10,[r10*1+rsi]
	cmp	rbx,r10				; Rip at/after the epilogue? pushes already undone
	jae	NEAR $L$common_seh_tail

	lea	rax,[48+rax]			; skip the six pushed registers

	mov	rbx,QWORD[((-8))+rax]
	mov	rbp,QWORD[((-16))+rax]
	mov	r12,QWORD[((-24))+rax]
	mov	r13,QWORD[((-32))+rax]
	mov	r14,QWORD[((-40))+rax]
	mov	r15,QWORD[((-48))+rax]
	mov	QWORD[144+r8],rbx		; context->Rbx
	mov	QWORD[160+r8],rbp		; context->Rbp
	mov	QWORD[216+r8],r12		; context->R12
	mov	QWORD[224+r8],r13		; context->R13
	mov	QWORD[232+r8],r14		; context->R14
	mov	QWORD[240+r8],r15		; context->R15

$L$common_seh_tail:
	; rax = rsp as it was on function entry; rdi/rsi were saved just above it
	mov	rdi,QWORD[8+rax]
	mov	rsi,QWORD[16+rax]
	mov	QWORD[152+r8],rax		; context->Rsp
	mov	QWORD[168+r8],rsi		; context->Rsi
	mov	QWORD[176+r8],rdi		; context->Rdi

	; copy the updated CONTEXT (154 qwords) to disp->ContextRecord
	mov	rdi,QWORD[40+r9]
	mov	rsi,r8
	mov	ecx,154
	cld					; same bytes as the former DD 0xa548f3fc
	rep movsq

	; RtlVirtualUnwind(0, ImageBase, ControlPc, FunctionEntry,
	;                  ContextRecord, &HandlerData, &EstablisherFrame, NULL)
	mov	rsi,r9
	xor	rcx,rcx
	mov	rdx,QWORD[8+rsi]
	mov	r8,QWORD[rsi]
	mov	r9,QWORD[16+rsi]
	mov	r10,QWORD[40+rsi]
	lea	r11,[56+rsi]
	lea	r12,[24+rsi]
	mov	QWORD[32+rsp],r10
	mov	QWORD[40+rsp],r11
	mov	QWORD[48+rsp],r12
	mov	QWORD[56+rsp],rcx
	call	QWORD[__imp_RtlVirtualUnwind]

	mov	eax,1				; handler disposition returned to the dispatcher
	add	rsp,64
	popfq
	pop	r15
	pop	r14
	pop	r13
	pop	r12
	pop	rbp
	pop	rbx
	pop	rdi
	pop	rsi
	DB	0F3h,0C3h		;repret
1002
1003
; .pdata: one record of three image-relative addresses per function:
; start label, end label, and the matching unwind-info record in .xdata.
1004section	.pdata rdata align=4
1005ALIGN	4
1006	DD	$L$SEH_begin_ossl_rsaz_amm52x20_x1_256 wrt ..imagebase
1007	DD	$L$SEH_end_ossl_rsaz_amm52x20_x1_256 wrt ..imagebase
1008	DD	$L$SEH_info_ossl_rsaz_amm52x20_x1_256 wrt ..imagebase
1009
1010	DD	$L$SEH_begin_ossl_rsaz_amm52x20_x2_256 wrt ..imagebase
1011	DD	$L$SEH_end_ossl_rsaz_amm52x20_x2_256 wrt ..imagebase
1012	DD	$L$SEH_info_ossl_rsaz_amm52x20_x2_256 wrt ..imagebase
1013
1014	DD	$L$SEH_begin_ossl_extract_multiplier_2x20_win5 wrt ..imagebase
1015	DD	$L$SEH_end_ossl_extract_multiplier_2x20_win5 wrt ..imagebase
1016	DD	$L$SEH_info_ossl_extract_multiplier_2x20_win5 wrt ..imagebase
1017
; .xdata: each record is a 4-byte header (9,0,0,0), the handler address, and
; two image-relative addresses that rsaz_def_handler reads as HandlerData[0]
; (start of body) and HandlerData[1] (start of epilogue).
; NOTE(review): header byte 9 is presumably UNWIND_INFO version 1 with the
; exception-handler flag set -- confirm against the Windows x64 unwind docs.
1018section	.xdata rdata align=8
1019ALIGN	8
1020$L$SEH_info_ossl_rsaz_amm52x20_x1_256:
1021DB	9,0,0,0
1022	DD	rsaz_def_handler wrt ..imagebase
1023	DD	$L$rsaz_amm52x20_x1_256_body wrt ..imagebase,$L$rsaz_amm52x20_x1_256_epilogue wrt ..imagebase
1024$L$SEH_info_ossl_rsaz_amm52x20_x2_256:
1025DB	9,0,0,0
1026	DD	rsaz_def_handler wrt ..imagebase
1027	DD	$L$rsaz_amm52x20_x2_256_body wrt ..imagebase,$L$rsaz_amm52x20_x2_256_epilogue wrt ..imagebase
; Both HandlerData entries name the function's begin label, so for any Rip
; inside this function the handler's "at/after epilogue" test succeeds and it
; never tries to restore pushed registers (this function pushes none).
1028$L$SEH_info_ossl_extract_multiplier_2x20_win5:
1029DB	9,0,0,0
1030	DD	rsaz_def_handler wrt ..imagebase
1031	DD	$L$SEH_begin_ossl_extract_multiplier_2x20_win5 wrt ..imagebase,$L$SEH_begin_ossl_extract_multiplier_2x20_win5 wrt ..imagebase
1032