// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
// Under MemorySanitizer, fall back to the C implementation: MSan cannot
// see into hand-written assembly, so force OPENSSL_NO_ASM.
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text
// bn_mul_mont: Montgomery multiplication, generic limb count.
// Register arguments (from the inline comments below):
//   x0 = rp (result), x1 = ap, x2 = bp, x3 = np (modulus),
//   x4 = &n0 (Montgomery constant), x5 = num (limb count).
// Dispatches to the 8x squaring or 4x multiply paths when num permits;
// otherwise runs the generic 1-limb-at-a-time loop below.
// Temporary tp[] is carved off the stack (alloca) and wiped before return.
.globl	_bn_mul_mont
.private_extern	_bn_mul_mont

.align	5
_bn_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
	tst	x5,#7
	b.eq	__bn_sqr8x_mont
	tst	x5,#3
	b.eq	__bn_mul4x_mont
Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3
	ldp	x7,x8,[x1],#16	// ap[0..1]
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	and	x22,x22,#-16		// ABI says so
	ldp	x13,x14,[x3],#16	// np[0..1]

	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	As for removal of first multiplication and addition
	//	instructions. The outcome of first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or not. Then
	//	question is when does it carry? Is there alternative
	//	way to deduce it? If you follow operations, you can
	//	observe that condition for carry is quite simple:
	//	x6 being non-zero. So that carry can be calculated
	//	by adding -1 to x6. That's what next instruction does.
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr
	cbz	x21,L1st_skip

L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,L1st

L1st_skip:
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// upmost overflow bit
	stp	x12,x13,[x22]

Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	cbz	x21,Linner_skip

Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	str	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,Linner

Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// upmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	str	xzr,[x22,#-16]		// wipe tp
	str	x14,[x0,#-16]
	cbnz	x5,Lcond_copy

	csel	x14,x23,x8,lo
	str	xzr,[x22,#-8]		// wipe tp
	str	x14,[x0,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	AARCH64_VALIDATE_LINK_REGISTER
	ret

// __bn_sqr8x_mont: Montgomery squaring path (entered when ap == bp and
// num is a multiple of 8; same register arguments as bn_mul_mont).
// Falls through to __bn_mul4x_mont when x1 != x2.
.align	5
__bn_sqr8x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
	// only from bn_mul_mont which has already signed the return address.
	cmp	x1,x2
	b.ne	__bn_mul4x_mont
Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]	// offload rp and np

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8
	b	Lsqr8x_zero_start

Lsqr8x_zero:
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,Lsqr8x_zero

	add	x3,x1,x5
	add	x1,x1,#8*8
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp
	str	x4,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
Lsqr8x_outer_loop:
        //                                                 a[1]a[0]	(i)
        //                                             a[2]a[0]
        //                                         a[3]a[0]
        //                                     a[4]a[0]
        //                                 a[5]a[0]
        //                             a[6]a[0]
        //                         a[7]a[0]
        //                                         a[2]a[1]		(ii)
        //                                     a[3]a[1]
        //                                 a[4]a[1]
        //                             a[5]a[1]
        //                         a[6]a[1]
        //                     a[7]a[1]
        //                                 a[3]a[2]			(iii)
        //                             a[4]a[2]
        //                         a[5]a[2]
        //                     a[6]a[2]
        //                 a[7]a[2]
        //                         a[4]a[3]				(iv)
        //                     a[5]a[3]
        //                 a[6]a[3]
        //             a[7]a[3]
        //                 a[5]a[4]					(v)
        //             a[6]a[4]
        //         a[7]a[4]
        //         a[6]a[5]						(vi)
        //     a[7]a[5]
        // a[7]a[6]							(vii)

	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1	// done yet?
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5	// rewinded ap
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,Lsqr8x_outer_break

	mov	x4,x6
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	x1,x3		// done yet?
	b.eq	Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldr	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	Lsqr8x_mul

.align	4
Lsqr8x_break:
	ldp	x6,x7,[x0,#8*0]
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	Lsqr8x_outer_loop

.align	4
Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1
	extr	x15,x16,x15,#63
	sub	x27,x5,#8*4

Lsqr4x_shift_n_add:
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,Lsqr4x_shift_n_add
	ldp	x1,x4,[x29,#104]	// pull np and n0

	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr		// initial top-most carry
	mov	x2,sp
	mov	x27,#8

Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14
	subs	xzr,x19,#1		// (*)
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,Lsqr8x_reduction

	ldp	x14,x15,[x2,#8*0]
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1	// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,Lsqr8x8_post_condition

	ldr	x4,[x2,#-8*8]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1	// done yet?
	sub	x16,x3,x5	// rewinded np
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,Lsqr8x_tail_break

	ldr	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	Lsqr8x_tail

.align	4
Lsqr8x_tail_break:
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	subs	xzr,x30,#1		// "move" top-most carry to carry bit
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr	// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29		// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	Lsqr8x_reduction

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0		// x0 copy

Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	sub	x27,x5,#8*4
Lsqr4x_cond_copy:
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,Lsqr4x_cond_copy

	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	Lsqr8x_done

.align	4
Lsqr8x8_post_condition:
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19-7,x28 hold result, x6-7 hold modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr	// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6-7 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret

// __bn_mul4x_mont: Montgomery multiplication processing 4 limbs per
// iteration (entered when num is a multiple of 4); same register
// arguments as bn_mul_mont.
.align	5
__bn_mul4x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
	// only from bn_mul_mont or __bn_mul8x_mont which have already signed the
	// return address.
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	sub	sp,x26,#8*4		// alloca

	add	x10,x2,x5
	add	x27,x1,x5
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0
	mov	x26,sp

Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_1st_reduction

	cbz	x10,Lmul4x4_post_condition

	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_1st_tail

	sub	x11,x27,x5	// rewinded x1
	cbz	x10,Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	Loop_mul4x_1st_tail

.align	5
Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

.align	4
Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align	4
Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_tail

	sub	x11,x3,x5		// rewinded np?
	adc	x0,x0,xzr
	cbz	x10,Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	Loop_mul4x_tail

.align	4
Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	Loop_mul4x_reduction

.align	4
Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	x0,x12
	mov	x27,x12		// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	x28,x5,#8*4
Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,Lmul4x_cond_copy

	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	Lmul4x_done

.align	4
Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-3,x0 hold result, x14-7 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-3 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret

// Identification string: "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	4
#endif  // !OPENSSL_NO_ASM
