// NOTE(review): removed HTML code-viewer navigation residue
// ("Home / Line# / Scopes# / Navigate / Raw / Download") that was
// captured when this file was scraped from a web source listing.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

// MemorySanitizer cannot instrument assembly, so disable the asm paths
// when building under MSan.
#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#include <GFp/arm_arch.h>

.text

.globl	_GFp_bn_mul_mont
.private_extern	_GFp_bn_mul_mont

.align	5
// _GFp_bn_mul_mont(rp=x0, ap=x1, bp=x2, np=x3, n0=x4, num=x5)
// Montgomery multiplication (see the .byte banner at end of file).
// Dispatches to __bn_sqr8x_mont when num%8==0 and to __bn_mul4x_mont
// when num%4==0; otherwise runs the generic word-by-word loop below.
// Temporary tp[] is allocated on the stack (num words, 16-byte aligned).
_GFp_bn_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
	tst	x5,#7
	b.eq	__bn_sqr8x_mont
	tst	x5,#3
	b.eq	__bn_mul4x_mont
Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3
	ldp	x7,x8,[x1],#16	// ap[0..1]
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	and	x22,x22,#-16		// ABI says so
	ldp	x13,x14,[x3],#16	// np[0..1]

	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	As for removal of first multiplication and addition
	//	instructions. The outcome of first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or not. Then
	//	question is when does it carry? Is there alternative
	//	way to deduce it? If you follow operations, you can
	//	observe that condition for carry is quite simple:
	//	x6 being non-zero. So that carry can be calculated
	//	by adding -1 to x6. That's what next instruction does.
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr
	cbz	x21,L1st_skip

L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,L1st

L1st_skip:
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// upmost overflow bit
	stp	x12,x13,[x22]

Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	cbz	x21,Linner_skip

Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	str	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,Linner

Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// upmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	str	xzr,[x22,#-16]		// wipe tp
	str	x14,[x0,#-16]
	cbnz	x5,Lcond_copy

	csel	x14,x23,x8,lo
	str	xzr,[x22,#-8]		// wipe tp
	str	x14,[x0,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	AARCH64_VALIDATE_LINK_REGISTER
	ret

// __bn_sqr8x_mont — squaring path, reached from _GFp_bn_mul_mont when
// num%8==0. If ap!=bp (cmp x1,x2 below) it is not a squaring and falls
// through to __bn_mul4x_mont. Computes the off-diagonal products, doubles
// them with the extr-based shift-and-add, adds the a[i]*a[i] diagonal,
// then performs the Montgomery reduction 512 bits per iteration.
.align	5
__bn_sqr8x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
	// only from bn_mul_mont which has already signed the return address.
	cmp	x1,x2
	b.ne	__bn_mul4x_mont
Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]	// offload rp and np

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8
	b	Lsqr8x_zero_start

Lsqr8x_zero:
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,Lsqr8x_zero

	add	x3,x1,x5
	add	x1,x1,#8*8
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp
	str	x4,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
Lsqr8x_outer_loop:
        //                                                 a[1]a[0]	(i)
        //                                             a[2]a[0]
        //                                         a[3]a[0]
        //                                     a[4]a[0]
        //                                 a[5]a[0]
        //                             a[6]a[0]
        //                         a[7]a[0]
        //                                         a[2]a[1]		(ii)
        //                                     a[3]a[1]
        //                                 a[4]a[1]
        //                             a[5]a[1]
        //                         a[6]a[1]
        //                     a[7]a[1]
        //                                 a[3]a[2]			(iii)
        //                             a[4]a[2]
        //                         a[5]a[2]
        //                     a[6]a[2]
        //                 a[7]a[2]
        //                         a[4]a[3]				(iv)
        //                     a[5]a[3]
        //                 a[6]a[3]
        //             a[7]a[3]
        //                 a[5]a[4]					(v)
        //             a[6]a[4]
        //         a[7]a[4]
        //         a[6]a[5]						(vi)
        //     a[7]a[5]
        // a[7]a[6]							(vii)

	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1	// done yet?
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5	// rewinded ap
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,Lsqr8x_outer_break

	mov	x4,x6
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	x1,x3		// done yet?
	b.eq	Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldr	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	Lsqr8x_mul

.align	4
Lsqr8x_break:
	ldp	x6,x7,[x0,#8*0]
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	Lsqr8x_outer_loop

.align	4
Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1
	extr	x15,x16,x15,#63
	sub	x27,x5,#8*4

Lsqr4x_shift_n_add:
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,Lsqr4x_shift_n_add
	ldp	x1,x4,[x29,#104]	// pull np and n0

	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr		// initial top-most carry
	mov	x2,sp
	mov	x27,#8

Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14
	subs	xzr,x19,#1		// (*)
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,Lsqr8x_reduction

	ldp	x14,x15,[x2,#8*0]
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1	// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,Lsqr8x8_post_condition

	ldr	x4,[x2,#-8*8]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1	// done yet?
	sub	x16,x3,x5	// rewinded np
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,Lsqr8x_tail_break

	ldr	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	Lsqr8x_tail

.align	4
Lsqr8x_tail_break:
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	subs	xzr,x30,#1		// "move" top-most carry to carry bit
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr	// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29		// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	Lsqr8x_reduction

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0		// x0 copy

Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	sub	x27,x5,#8*4
Lsqr4x_cond_copy:
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,Lsqr4x_cond_copy

	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	Lsqr8x_done

.align	4
Lsqr8x8_post_condition:
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19-7,x28 hold result, x6-7 hold modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr	// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6-7 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret

// __bn_mul4x_mont — 4-limbs-at-a-time Montgomery multiplication, reached
// from _GFp_bn_mul_mont when num%4==0 (and from __bn_sqr8x_mont when
// ap!=bp). Interleaves multiplication by b[i] with the n0-driven
// reduction; t[0]*n0 values are stashed on the stack for tail processing.
.align	5
__bn_mul4x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
	// only from bn_mul_mont or __bn_mul8x_mont which have already signed the
	// return address.
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	sub	sp,x26,#8*4		// alloca

	add	x10,x2,x5
	add	x27,x1,x5
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0
	mov	x26,sp

Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_1st_reduction

	cbz	x10,Lmul4x4_post_condition

	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_1st_tail

	sub	x11,x27,x5	// rewinded x1
	cbz	x10,Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	Loop_mul4x_1st_tail

.align	5
Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

.align	4
Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align	4
Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_tail

	sub	x11,x3,x5		// rewinded np?
	adc	x0,x0,xzr
	cbz	x10,Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	Loop_mul4x_tail

.align	4
Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	Loop_mul4x_reduction

.align	4
Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	x0,x12
	mov	x27,x12		// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	x28,x5,#8*4
Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,Lmul4x_cond_copy

	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	Lmul4x_done

.align	4
Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-3,x0 hold result, x14-7 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-3 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret

// ASCII banner: "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	4
#endif  // !OPENSSL_NO_ASM