• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
; Select the code section according to the assembler output format:
;   obj   -> 32-bit section named "code", class=code, 64-byte aligned
;   win32 -> ".text", 64-byte aligned (plus the @feat.00 handling below)
;   other -> plain ".text"
1%ifidn __OUTPUT_FORMAT__,obj
2section	code	use32 class=code align=64
3%elifidn __OUTPUT_FORMAT__,win32
; When assembled with Yasm, refuse versions older than 1.1.0.
4%ifdef __YASM_VERSION_ID__
5%if __YASM_VERSION_ID__ < 01010000h
6%error yasm version 1.1.0 or later needed.
7%endif
8; Yasm automatically includes @feat.00 and complains about redefining it.
9; https://www.tortall.net/projects/yasm/manual/html/objfmt-win32-safeseh.html
10%else
; Not Yasm: define the @feat.00 symbol explicitly (see the safeseh page
; linked above for what this symbol means to the win32 object format).
11$@feat.00 equ 1
12%endif
13section	.text	code align=64
14%else
15section	.text	code
16%endif
17;extern	_OPENSSL_ia32cap_P
;-----------------------------------------------------------------------
; _bn_mul_mont -- word-serial Montgomery multiplication, 32-bit x86.
;
; Reads six dword stack arguments (arg0 is at [esp+4] on entry).
; NOTE(review): the names below follow the customary prototype
;   int bn_mul_mont(rp, ap, bp, np, n0, num)
; which is not visible in this file -- confirm against the C header.
;   arg0 "rp"  : destination vector, written in the common tail
;   arg1 "ap"  : first multiplicand vector
;   arg2 "bp"  : second multiplicand vector
;   arg3 "np"  : modulus vector
;   arg4 "n0"  : pointer; only the first dword it points at is used
;   arg5 "num" : number of 32-bit words in each vector
;
; Returns: eax = 0 without touching anything when num < 4 (signed
;          compare), eax = 1 otherwise.
; Saves/restores ebp, ebx, esi, edi.  Uses eax, ecx, edx as scratch and,
; on the SSE2 path, mm0-mm7 (emms is executed before that path ends).
;
; Private stack frame (after the frame set-up below):
;   [esp+ 0]  scratch; the squaring path keeps num-1 here
;   [esp+ 4]  rp
;   [esp+ 8]  ap
;   [esp+12]  bp; the non-SSE2 paths reuse it as running b pointer/index
;   [esp+16]  np
;   [esp+20]  n0 value (*arg4)
;   [esp+24]  stack pointer value from before the frame was allocated
;   [esp+28]  non-SSE2 multiply path: &bp[num] (end-of-b sentinel)
;   [esp+32]  tp[0 .. num+1], temporary result vector
;-----------------------------------------------------------------------
18global	_bn_mul_mont
19align	16
20_bn_mul_mont:
21L$_bn_mul_mont_begin:
22	push	ebp
23	push	ebx
24	push	esi
25	push	edi
; eax = 0 is the return value for the early exit.  After four pushes the
; return address is at [esp+16], so [esp+40] is arg5 (num).
26	xor	eax,eax
27	mov	edi,DWORD [40+esp]
28	cmp	edi,4
29	jl	NEAR L$000just_leave
; esi = &arg0, edx = &arg1.
30	lea	esi,[20+esp]
31	lea	edx,[24+esp]
; ebp = esp - 32 - 4*(num+2): candidate frame address (32-byte header plus
; num+2 temporary words).  edi is left holding num+2.
32	add	edi,2
33	neg	edi
34	lea	ebp,[edi*4+esp-32]
35	neg	edi
; Lower ebp so that it is congruent to edx modulo 2048 ...
36	mov	eax,ebp
37	sub	eax,edx
38	and	eax,2047
39	sub	ebp,eax
; ... then lower it by another 2048 if bit 11 of ebp and edx agree, so the
; two addresses end up differing in bit 11; finally align down to 64 bytes.
; NOTE(review): presumably chosen to control how the frame aliases with
; the argument area in the cache -- confirm against the generator script.
40	xor	edx,ebp
41	and	edx,2048
42	xor	edx,2048
43	sub	ebp,edx
44	and	ebp,-64
; edx = original esp (saved into the frame later).  Set esp to the frame
; address plus a whole number of 4096-byte pages, read a dword there, and
; then step down one page at a time, reading a dword at each step, until
; esp reaches ebp.
; NOTE(review): looks like a stack probe for guard pages -- confirm.
45	mov	eax,esp
46	sub	eax,ebp
47	and	eax,-4096
48	mov	edx,esp
49	lea	esp,[eax*1+ebp]
50	mov	eax,DWORD [esp]
51	cmp	esp,ebp
52	ja	NEAR L$001page_walk
53	jmp	NEAR L$002page_walk_done
54align	16
55L$001page_walk:
56	lea	esp,[esp-4096]
57	mov	eax,DWORD [esp]
58	cmp	esp,ebp
59	ja	NEAR L$001page_walk
60L$002page_walk_done:
; Copy the arguments into the new frame.  arg4 is dereferenced once, so
; [esp+20] holds the value it points at rather than the pointer.
61	mov	eax,DWORD [esi]
62	mov	ebx,DWORD [4+esi]
63	mov	ecx,DWORD [8+esi]
64	mov	ebp,DWORD [12+esi]
65	mov	esi,DWORD [16+esi]
66	mov	esi,DWORD [esi]
67	mov	DWORD [4+esp],eax
68	mov	DWORD [8+esp],ebx
69	mov	DWORD [12+esp],ecx
70	mov	DWORD [16+esp],ebp
71	mov	DWORD [20+esp],esi
; ebx = (num+2) - 3 = num-1, the index of the last word.
72	lea	ebx,[edi-3]
; Remember the pre-allocation stack pointer for the epilogue.
73	mov	DWORD [24+esp],edx
; Dispatch on bit 26 of the first dword of _OPENSSL_ia32cap_P: if clear,
; use the integer-only code at L$003non_sse2; otherwise fall through to
; the pmuludq-based code.
74	lea	eax,[_OPENSSL_ia32cap_P]
75	bt	DWORD [eax],26
76	jnc	NEAR L$003non_sse2
; ---------------------------------------------------------------------
; SSE2 path (64-bit arithmetic in MMX registers).
;   mm7 = 0x00000000FFFFFFFF   mask for the low 32 bits
;   mm4 = current word of b    mm5 = m (per-iteration reduction factor)
;   mm2 = running a*b sum      mm3 = running n*m sum
;   esi = ap, edi = bp, ebp = np, edx = outer index i, ecx = inner index j
; ---------------------------------------------------------------------
77	mov	eax,-1
78	movd	mm7,eax
79	mov	esi,DWORD [8+esp]
80	mov	edi,DWORD [12+esp]
81	mov	ebp,DWORD [16+esp]
82	xor	edx,edx
83	xor	ecx,ecx
; First pass (i = 0): mm5 = ap[0]*bp[0]; m = low32(that) * n0; then start
; the n*m sum with np[0]*m + low32(ap[0]*bp[0]) and keep only the carries.
84	movd	mm4,DWORD [edi]
85	movd	mm5,DWORD [esi]
86	movd	mm3,DWORD [ebp]
87	pmuludq	mm5,mm4
88	movq	mm2,mm5
89	movq	mm0,mm5
90	pand	mm0,mm7
91	pmuludq	mm5,[20+esp]
92	pmuludq	mm3,mm5
93	paddq	mm3,mm0
94	movd	mm1,DWORD [4+ebp]
95	movd	mm0,DWORD [4+esi]
96	psrlq	mm2,32
97	psrlq	mm3,32
98	inc	ecx
99align	16
; For j = 1 .. num-2: accumulate ap[j]*bp[0] and np[j]*m, store the low
; dword of the combined sum to tp[j-1], and preload ap[j+1] / np[j+1].
100L$0041st:
101	pmuludq	mm0,mm4
102	pmuludq	mm1,mm5
103	paddq	mm2,mm0
104	paddq	mm3,mm1
105	movq	mm0,mm2
106	pand	mm0,mm7
107	movd	mm1,DWORD [4+ecx*4+ebp]
108	paddq	mm3,mm0
109	movd	mm0,DWORD [4+ecx*4+esi]
110	psrlq	mm2,32
111	movd	DWORD [28+ecx*4+esp],mm3
112	psrlq	mm3,32
113	lea	ecx,[1+ecx]
114	cmp	ecx,ebx
115	jl	NEAR L$0041st
; Last word (j = num-1): store tp[num-2], then write the sum of both
; carries as a qword covering tp[num-1] and tp[num].
116	pmuludq	mm0,mm4
117	pmuludq	mm1,mm5
118	paddq	mm2,mm0
119	paddq	mm3,mm1
120	movq	mm0,mm2
121	pand	mm0,mm7
122	paddq	mm3,mm0
123	movd	DWORD [28+ecx*4+esp],mm3
124	psrlq	mm2,32
125	psrlq	mm3,32
126	paddq	mm3,mm2
127	movq	[32+ebx*4+esp],mm3
128	inc	edx
; Outer loop over the remaining words of b (i = edx = 1 .. num-1).  Same
; as the first pass, except that the previous tp[] words are added in.
129L$005outer:
130	xor	ecx,ecx
131	movd	mm4,DWORD [edx*4+edi]
132	movd	mm5,DWORD [esi]
133	movd	mm6,DWORD [32+esp]
134	movd	mm3,DWORD [ebp]
135	pmuludq	mm5,mm4
136	paddq	mm5,mm6
137	movq	mm0,mm5
138	movq	mm2,mm5
139	pand	mm0,mm7
140	pmuludq	mm5,[20+esp]
141	pmuludq	mm3,mm5
142	paddq	mm3,mm0
143	movd	mm6,DWORD [36+esp]
144	movd	mm1,DWORD [4+ebp]
145	movd	mm0,DWORD [4+esi]
146	psrlq	mm2,32
147	psrlq	mm3,32
148	paddq	mm2,mm6
149	inc	ecx
; ebx doubles as the inner loop's down-counter (num-2 iterations); it is
; restored from ecx once the loop finishes.
150	dec	ebx
151L$006inner:
152	pmuludq	mm0,mm4
153	pmuludq	mm1,mm5
154	paddq	mm2,mm0
155	paddq	mm3,mm1
156	movq	mm0,mm2
157	movd	mm6,DWORD [36+ecx*4+esp]
158	pand	mm0,mm7
159	movd	mm1,DWORD [4+ecx*4+ebp]
160	paddq	mm3,mm0
161	movd	mm0,DWORD [4+ecx*4+esi]
162	psrlq	mm2,32
163	movd	DWORD [28+ecx*4+esp],mm3
164	psrlq	mm3,32
165	paddq	mm2,mm6
; The jnz below tests the flags from this dec; lea does not modify flags.
166	dec	ebx
167	lea	ecx,[1+ecx]
168	jnz	NEAR L$006inner
; ebx = num-1 again.  Finish the last word and fold the old tp[num] into
; the new top qword (tp[num-1], tp[num]).
169	mov	ebx,ecx
170	pmuludq	mm0,mm4
171	pmuludq	mm1,mm5
172	paddq	mm2,mm0
173	paddq	mm3,mm1
174	movq	mm0,mm2
175	pand	mm0,mm7
176	paddq	mm3,mm0
177	movd	DWORD [28+ecx*4+esp],mm3
178	psrlq	mm2,32
179	psrlq	mm3,32
180	movd	mm6,DWORD [36+ebx*4+esp]
181	paddq	mm3,mm2
182	paddq	mm3,mm6
183	movq	[32+ebx*4+esp],mm3
184	lea	edx,[1+edx]
185	cmp	edx,ebx
186	jle	NEAR L$005outer
; Leave MMX state before returning to integer code.
187	emms
188	jmp	NEAR L$007common_tail
189align	16
; ---------------------------------------------------------------------
; Integer-only path.
; ebp = (num & 1) | (ap - bp) is zero only when ap == bp and num is even;
; in that case the dedicated squaring code is used.  eax = &bp[num] and
; edi = bp[0] are prepared for the multiply path; the jz still sees the
; flags of the `or` because the intervening mov does not change flags.
; ---------------------------------------------------------------------
190L$003non_sse2:
191	mov	esi,DWORD [8+esp]
192	lea	ebp,[1+ebx]
193	mov	edi,DWORD [12+esp]
194	xor	ecx,ecx
195	mov	edx,esi
196	and	ebp,1
197	sub	edx,edi
198	lea	eax,[4+ebx*4+edi]
199	or	ebp,edx
200	mov	edi,DWORD [edi]
201	jz	NEAR L$008bn_sqr_mont
; Multiply path: [esp+28] = &bp[num] marks the end of b.
202	mov	DWORD [28+esp],eax
203	mov	eax,DWORD [esi]
204	xor	edx,edx
205align	16
; tp[j] = low32(ap[j]*bp[0] + carry) for j = 0 .. num-2; edx carries the
; high word into the next iteration.
206L$009mull:
207	mov	ebp,edx
208	mul	edi
209	add	ebp,eax
210	lea	ecx,[1+ecx]
211	adc	edx,0
212	mov	eax,DWORD [ecx*4+esi]
213	cmp	ecx,ebx
214	mov	DWORD [28+ecx*4+esp],ebp
215	jl	NEAR L$009mull
; Last word of a*bp[0]; tp[num-1] and tp[num] receive the top of the
; product and tp[num+1] is cleared.  Meanwhile edi = n0 * tp[0] (the
; reduction factor m) and esi = np.
216	mov	ebp,edx
217	mul	edi
218	mov	edi,DWORD [20+esp]
219	add	eax,ebp
220	mov	esi,DWORD [16+esp]
221	adc	edx,0
222	imul	edi,DWORD [32+esp]
223	mov	DWORD [32+ebx*4+esp],eax
224	xor	ecx,ecx
225	mov	DWORD [36+ebx*4+esp],edx
226	mov	DWORD [40+ebx*4+esp],ecx
; np[0]*m + tp[0]: only the carry out of the low word is kept (eax is
; immediately reloaded with np[1]).
227	mov	eax,DWORD [esi]
228	mul	edi
229	add	eax,DWORD [32+esp]
230	mov	eax,DWORD [4+esi]
231	adc	edx,0
232	inc	ecx
233	jmp	NEAR L$0102ndmadd
234align	16
; tp[j] = low32(tp[j] + ap[j]*b_i + carry) for j = 0 .. num-2, where
; edi = b_i is the current word of b.
235L$0111stmadd:
236	mov	ebp,edx
237	mul	edi
238	add	ebp,DWORD [32+ecx*4+esp]
239	lea	ecx,[1+ecx]
240	adc	edx,0
241	add	ebp,eax
242	mov	eax,DWORD [ecx*4+esi]
243	adc	edx,0
244	cmp	ecx,ebx
245	mov	DWORD [28+ecx*4+esp],ebp
246	jl	NEAR L$0111stmadd
; Last word of the multiply-accumulate; propagate the carry into
; tp[num-1], tp[num], tp[num+1], and compute m = n0 * tp[0] for the
; reduction that follows (esi = np).
247	mov	ebp,edx
248	mul	edi
249	add	eax,DWORD [32+ebx*4+esp]
250	mov	edi,DWORD [20+esp]
251	adc	edx,0
252	mov	esi,DWORD [16+esp]
253	add	ebp,eax
254	adc	edx,0
255	imul	edi,DWORD [32+esp]
256	xor	ecx,ecx
257	add	edx,DWORD [36+ebx*4+esp]
258	mov	DWORD [32+ebx*4+esp],ebp
259	adc	ecx,0
260	mov	eax,DWORD [esi]
261	mov	DWORD [36+ebx*4+esp],edx
262	mov	DWORD [40+ebx*4+esp],ecx
263	mul	edi
264	add	eax,DWORD [32+esp]
265	mov	eax,DWORD [4+esi]
266	adc	edx,0
267	mov	ecx,1
268align	16
; Reduction: tp[j-1] = low32(tp[j] + np[j]*m + carry) for j = 1 .. num-2,
; i.e. the vector is shifted down by one word while np*m is added.
269L$0102ndmadd:
270	mov	ebp,edx
271	mul	edi
272	add	ebp,DWORD [32+ecx*4+esp]
273	lea	ecx,[1+ecx]
274	adc	edx,0
275	add	ebp,eax
276	mov	eax,DWORD [ecx*4+esi]
277	adc	edx,0
278	cmp	ecx,ebx
279	mov	DWORD [24+ecx*4+esp],ebp
280	jl	NEAR L$0102ndmadd
; Last word of the reduction: store tp[num-2], then shift the two top
; words down (tp[num-1] = carry + tp[num], tp[num] = tp[num+1] + CF).
281	mov	ebp,edx
282	mul	edi
283	add	ebp,DWORD [32+ebx*4+esp]
284	adc	edx,0
285	add	ebp,eax
286	adc	edx,0
287	mov	DWORD [28+ebx*4+esp],ebp
288	xor	eax,eax
289	mov	ecx,DWORD [12+esp]
290	add	edx,DWORD [36+ebx*4+esp]
291	adc	eax,DWORD [40+ebx*4+esp]
; Advance the b pointer by one word; done when it reaches &bp[num].
292	lea	ecx,[4+ecx]
293	mov	DWORD [32+ebx*4+esp],edx
294	cmp	ecx,DWORD [28+esp]
295	mov	DWORD [36+ebx*4+esp],eax
296	je	NEAR L$007common_tail
; Otherwise load the next word of b and start another multiply-accumulate
; pass over a.
297	mov	edi,DWORD [ecx]
298	mov	esi,DWORD [8+esp]
299	mov	DWORD [12+esp],ecx
300	xor	ecx,ecx
301	xor	edx,edx
302	mov	eax,DWORD [esi]
303	jmp	NEAR L$0111stmadd
304align	16
; ---------------------------------------------------------------------
; Squaring path (reached only with ap == bp and num even).
; [esp] = num-1, [esp+12] = current index i (starts at 0), edi = ap[0].
; tp[0] = low32(ap[0]^2); the high word is split into edx = high >> 1 and
; ebx = high & 1 so the cross products below can be doubled.
; ---------------------------------------------------------------------
305L$008bn_sqr_mont:
306	mov	DWORD [esp],ebx
307	mov	DWORD [12+esp],ecx
308	mov	eax,edi
309	mul	edi
310	mov	DWORD [32+esp],eax
311	mov	ebx,edx
312	shr	edx,1
313	and	ebx,1
314	inc	ecx
315align	16
; For j = 1 .. num-2: tp[j] = 2*low32(ap[j]*ap[0] + carry) + ebx, with
; the bit shifted out of the doubling kept in ebx for the next word.
316L$012sqr:
317	mov	eax,DWORD [ecx*4+esi]
318	mov	ebp,edx
319	mul	edi
320	add	eax,ebp
321	lea	ecx,[1+ecx]
322	adc	edx,0
323	lea	ebp,[eax*2+ebx]
324	shr	eax,31
325	cmp	ecx,DWORD [esp]
326	mov	ebx,eax
327	mov	DWORD [28+ecx*4+esp],ebp
328	jl	NEAR L$012sqr
; Last word (j = num-1): store the doubled result into tp[num-1],
; tp[num], tp[num+1]; compute m = n0 * tp[0] and start the reduction with
; esi = np and ebx = num-1.
329	mov	eax,DWORD [ecx*4+esi]
330	mov	ebp,edx
331	mul	edi
332	add	eax,ebp
333	mov	edi,DWORD [20+esp]
334	adc	edx,0
335	mov	esi,DWORD [16+esp]
336	lea	ebp,[eax*2+ebx]
337	imul	edi,DWORD [32+esp]
338	shr	eax,31
339	mov	DWORD [32+ecx*4+esp],ebp
340	lea	ebp,[edx*2+eax]
341	mov	eax,DWORD [esi]
342	shr	edx,31
343	mov	DWORD [36+ecx*4+esp],ebp
344	mov	DWORD [40+ecx*4+esp],edx
345	mul	edi
346	add	eax,DWORD [32+esp]
347	mov	ebx,ecx
348	adc	edx,0
349	mov	eax,DWORD [4+esi]
350	mov	ecx,1
351align	16
; Reduction for the squaring path, unrolled to handle two words per
; iteration (ecx advances by 2): tp[j-1] = low32(tp[j] + np[j]*m + carry).
352L$0133rdmadd:
353	mov	ebp,edx
354	mul	edi
355	add	ebp,DWORD [32+ecx*4+esp]
356	adc	edx,0
357	add	ebp,eax
358	mov	eax,DWORD [4+ecx*4+esi]
359	adc	edx,0
360	mov	DWORD [28+ecx*4+esp],ebp
361	mov	ebp,edx
362	mul	edi
363	add	ebp,DWORD [36+ecx*4+esp]
364	lea	ecx,[2+ecx]
365	adc	edx,0
366	add	ebp,eax
367	mov	eax,DWORD [ecx*4+esi]
368	adc	edx,0
369	cmp	ecx,ebx
370	mov	DWORD [24+ecx*4+esp],ebp
371	jl	NEAR L$0133rdmadd
; Last word of the reduction, then shift the two top words down as in the
; multiply path.  ecx = i (from [esp+12]), esi = ap.
372	mov	ebp,edx
373	mul	edi
374	add	ebp,DWORD [32+ebx*4+esp]
375	adc	edx,0
376	add	ebp,eax
377	adc	edx,0
378	mov	DWORD [28+ebx*4+esp],ebp
379	mov	ecx,DWORD [12+esp]
380	xor	eax,eax
381	mov	esi,DWORD [8+esp]
382	add	edx,DWORD [36+ebx*4+esp]
383	adc	eax,DWORD [40+ebx*4+esp]
384	mov	DWORD [32+ebx*4+esp],edx
385	cmp	ecx,ebx
386	mov	DWORD [36+ebx*4+esp],eax
; Finished once i == num-1.
387	je	NEAR L$007common_tail
; Next i: edi = ap[i]; add ap[i]^2 into tp[i].  If i is the last index
; there are no cross products left, so skip straight to L$014sqrlast with
; ebp = 0 and edx = high word of the square.
388	mov	edi,DWORD [4+ecx*4+esi]
389	lea	ecx,[1+ecx]
390	mov	eax,edi
391	mov	DWORD [12+esp],ecx
392	mul	edi
393	add	eax,DWORD [32+ecx*4+esp]
394	adc	edx,0
395	mov	DWORD [32+ecx*4+esp],eax
396	xor	ebp,ebp
397	cmp	ecx,ebx
398	lea	ecx,[1+ecx]
399	je	NEAR L$014sqrlast
; Split the high word as before: edx = high >> 1, ebx = high & 1.
400	mov	ebx,edx
401	shr	edx,1
402	and	ebx,1
403align	16
; For j = i+1 .. num-1: tp[j] += 2*low32(ap[j]*ap[i] + carry) + ebx; the
; carries out of the doubling and of both additions are collected in ebx.
404L$015sqradd:
405	mov	eax,DWORD [ecx*4+esi]
406	mov	ebp,edx
407	mul	edi
408	add	eax,ebp
409	lea	ebp,[eax*1+eax]
410	adc	edx,0
411	shr	eax,31
412	add	ebp,DWORD [32+ecx*4+esp]
413	lea	ecx,[1+ecx]
414	adc	eax,0
415	add	ebp,ebx
416	adc	eax,0
417	cmp	ecx,DWORD [esp]
418	mov	DWORD [28+ecx*4+esp],ebp
419	mov	ebx,eax
420	jle	NEAR L$015sqradd
; Double the final carry word: edx = 2*edx + ebx, ebp = bits carried out.
421	mov	ebp,edx
422	add	edx,edx
423	shr	ebp,31
424	add	edx,ebx
425	adc	ebp,0
; Add ebp:edx into the top of tp, compute m = n0 * tp[0], and re-enter the
; reduction loop with esi = np, ebx = ecx-1 and ecx = 1.
426L$014sqrlast:
427	mov	edi,DWORD [20+esp]
428	mov	esi,DWORD [16+esp]
429	imul	edi,DWORD [32+esp]
430	add	edx,DWORD [32+ecx*4+esp]
431	mov	eax,DWORD [esi]
432	adc	ebp,0
433	mov	DWORD [32+ecx*4+esp],edx
434	mov	DWORD [36+ecx*4+esp],ebp
435	mul	edi
436	add	eax,DWORD [32+esp]
437	lea	ebx,[ecx-1]
438	adc	edx,0
439	mov	ecx,1
440	mov	eax,DWORD [4+esi]
441	jmp	NEAR L$0133rdmadd
442align	16
; ---------------------------------------------------------------------
; Common tail (all paths arrive with ebx = num-1).
; ebp = np, edi = rp, esi = &tp[0].  `xor edx,edx` also clears CF so the
; first sbb starts without a borrow.
; ---------------------------------------------------------------------
443L$007common_tail:
444	mov	ebp,DWORD [16+esp]
445	mov	edi,DWORD [4+esp]
446	lea	esi,[32+esp]
447	mov	eax,DWORD [esi]
448	mov	ecx,ebx
449	xor	edx,edx
450align	16
; rp[j] = tp[j] - np[j] - borrow for j = 0 .. num-1.  The borrow lives in
; CF across iterations: dec, mov and lea leave CF untouched.  The loop
; ends when dec takes ecx from 0 to -1.
451L$016sub:
452	sbb	eax,DWORD [edx*4+ebp]
453	mov	DWORD [edx*4+edi],eax
454	dec	ecx
455	mov	eax,DWORD [4+edx*4+esi]
456	lea	edx,[1+edx]
457	jge	NEAR L$016sub
; eax = tp[num] - borrow, used as a select mask without branching:
; esi = (tp & eax) | (rp & ~eax).  With eax = all ones the copy below
; takes tp (the unsubtracted value); with eax = 0 it copies rp onto
; itself, keeping the subtracted value.
458	sbb	eax,0
459	and	esi,eax
460	not	eax
461	mov	ebp,edi
462	and	ebp,eax
463	or	esi,ebp
464align	16
; For j = num-1 down to 0: rp[j] = selected[j], and overwrite tp[j] with
; ecx (which is -1 after the subtraction loop) so the temporary vector no
; longer holds intermediate data.
465L$017copy:
466	mov	eax,DWORD [ebx*4+esi]
467	mov	DWORD [ebx*4+edi],eax
468	mov	DWORD [32+ebx*4+esp],ecx
469	dec	ebx
470	jge	NEAR L$017copy
; Restore the stack pointer saved before the frame was allocated and
; return 1.
471	mov	esp,DWORD [24+esp]
472	mov	eax,1
; Shared epilogue; the early exit arrives here with eax = 0.
473L$000just_leave:
474	pop	edi
475	pop	esi
476	pop	ebx
477	pop	ebp
478	ret
; NUL-terminated ASCII identification string placed after the code:
; "Montgomery Multiplication for x86, CRYPTOGAMS by <appro@openssl.org>"
479db	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
480db	112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
481db	54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
482db	32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
483db	111,114,103,62,0
; _OPENSSL_ia32cap_P is declared as a 16-byte common symbol in .bss;
; _bn_mul_mont tests bit 26 of its first dword to choose a code path.
484segment	.bss
485common	_OPENSSL_ia32cap_P 16
486