// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text

.globl	bn_mul_mont

.def bn_mul_mont
   .type 32
.endef
.align	5
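// bn_mul_mont computes the Montgomery product rp[] = ap[]*bp[]/2^(64*num)
// mod np[]. Editorial sketch of the dispatch performed by the first four
// instructions below, assuming the usual OpenSSL argument layout
// (x0=rp, x1=ap, x2=bp, x3=np, x4=&n0, x5=num):
//
//	if ((num & 7) == 0) goto __bn_sqr8x_mont;  // 8x path (also squares)
//	if ((num & 3) == 0) goto __bn_mul4x_mont;  // 4x path
//	/* otherwise fall through to the generic word-by-word Lmul_mont */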
bn_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
	tst	x5,#7
	b.eq	__bn_sqr8x_mont
	tst	x5,#3
	b.eq	__bn_mul4x_mont
Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3
	ldp	x7,x8,[x1],#16	// ap[0..1]
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	and	x22,x22,#-16		// ABI says so
	ldp	x13,x14,[x3],#16	// np[0..1]

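	// The word loaded from [x4] above is n0 = -np[0]^(-1) mod 2^64,
	// precomputed by the caller. Editorial C sketch of how such a
	// negated inverse can be obtained by Hensel/Newton lifting
	// (assumption: np[0] is odd, as Montgomery reduction requires):
	//
	//	static uint64_t neg_inv64(uint64_t n) {
	//		uint64_t x = n;         // n*n == 1 mod 8 for odd n
	//		for (int i = 0; i < 5; i++)
	//			x *= 2 - n*x;   // precision doubles each step
	//		return (uint64_t)0 - x; // x == n^(-1) mod 2^64 here
	//	}
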
	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	On the removal of the first multiplication and addition
	//	instructions: the outcome of the first addition is
	//	guaranteed to be zero, which leaves only two computationally
	//	significant outcomes: it either carries or it doesn't.
	//	When does it carry? Following the operations shows that
	//	the condition for a carry is quite simple: x6 being
	//	non-zero. The carry can therefore be computed by adding
	//	-1 to x6, which is what the next instruction does.
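	//	Editorial C illustration of the trick (assumption: m1 in
	//	x15 was computed as x6*n0 with n0 == -np[0]^(-1) mod 2^64,
	//	so the low half of np[0]*m1 is exactly -x6 mod 2^64):
	//
	//		uint64_t lo = (uint64_t)0 - x6; // lo64(np[0]*m1)
	//		uint64_t sum = x6 + lo;         // always 0 mod 2^64
	//		int carry = sum < x6;           // carry == (x6 != 0)
	//
	//	"subs xzr,x6,#1" sets the C flag iff x6 >= 1, i.e. iff x6
	//	is non-zero, reproducing exactly that carry.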
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr
	cbz	x21,L1st_skip

L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,L1st

L1st_skip:
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// upmost overflow bit
	stp	x12,x13,[x22]

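	// The pass above and the Louter loop below implement word-by-word
	// (CIOS-style) Montgomery multiplication. Editorial C reference of
	// the algorithm, not of the scheduling used here (tp[] stands for
	// the stack temporary; __int128 for the mul/umulh pairs):
	//
	//	uint64_t tp[num+2];               // zeroed before the loop
	//	for (int i = 0; i < num; i++) {
	//		unsigned __int128 c = 0;
	//		for (int j = 0; j < num; j++) {   // tp += ap[]*bp[i]
	//			c += (unsigned __int128)ap[j]*bp[i] + tp[j];
	//			tp[j] = (uint64_t)c; c >>= 64;
	//		}
	//		c += tp[num];
	//		tp[num] = (uint64_t)c; tp[num+1] = (uint64_t)(c>>64);
	//		uint64_t m1 = tp[0]*n0;           // mod 2^64
	//		c = 0;
	//		for (int j = 0; j < num; j++) {   // tp += np[]*m1
	//			c += (unsigned __int128)np[j]*m1 + tp[j];
	//			tp[j] = (uint64_t)c; c >>= 64;
	//		}
	//		c += tp[num];
	//		tp[num] = (uint64_t)c; tp[num+1] += (uint64_t)(c>>64);
	//		for (int j = 0; j <= num; j++)    // tp >>= 64
	//			tp[j] = tp[j+1];          // tp[0] is 0 by now
	//		tp[num+1] = 0;
	//	}
	//	// conditional subtraction of np[] follows, see Lsub below
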
Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	cbz	x21,Linner_skip

Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	str	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,Linner

Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// upmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,Louter

	// Final step. We check whether the result is larger than the
	// modulus and, if it is, subtract the modulus. But comparison
	// already implies subtraction, so we subtract the modulus,
	// check whether the subtraction borrowed, and conditionally
	// copy back the original value.
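	// Editorial C sketch of this tail (the borrow left by the last
	// sbcs drives the csel condition "lo"; rp[] already holds
	// tp[]-np[] when the copy loop runs):
	//
	//	for (int j = 0; j < num; j++) {
	//		rp[j] = borrow ? tp[j] : rp[j]; // keep tp if it borrowed
	//		tp[j] = 0;                      // wipe the temporary
	//	}
	//
	// The assembly uses csel rather than a branch so the selection
	// is constant-time.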
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	str	xzr,[x22,#-16]		// wipe tp
	str	x14,[x0,#-16]
	cbnz	x5,Lcond_copy

	csel	x14,x23,x8,lo
	str	xzr,[x22,#-8]		// wipe tp
	str	x14,[x0,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.def __bn_sqr8x_mont
   .type 32
.endef
.align	5
__bn_sqr8x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
	// only from bn_mul_mont which has already signed the return address.
	cmp	x1,x2
	b.ne	__bn_mul4x_mont
Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]	// offload rp and np

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8
	b	Lsqr8x_zero_start

Lsqr8x_zero:
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,Lsqr8x_zero

	add	x3,x1,x5
	add	x1,x1,#8*8
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp
	str	x4,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
Lsqr8x_outer_loop:
        //                                                 a[1]a[0]	(i)
        //                                             a[2]a[0]
        //                                         a[3]a[0]
        //                                     a[4]a[0]
        //                                 a[5]a[0]
        //                             a[6]a[0]
        //                         a[7]a[0]
        //                                         a[2]a[1]		(ii)
        //                                     a[3]a[1]
        //                                 a[4]a[1]
        //                             a[5]a[1]
        //                         a[6]a[1]
        //                     a[7]a[1]
        //                                 a[3]a[2]			(iii)
        //                             a[4]a[2]
        //                         a[5]a[2]
        //                     a[6]a[2]
        //                 a[7]a[2]
        //                         a[4]a[3]				(iv)
        //                     a[5]a[3]
        //                 a[6]a[3]
        //             a[7]a[3]
        //                 a[5]a[4]					(v)
        //             a[6]a[4]
        //         a[7]a[4]
        //         a[6]a[5]						(vi)
        //     a[7]a[5]
        // a[7]a[6]							(vii)
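	// Editorial sketch of the decomposition this schedule implements:
	// only the i<j cross products are accumulated here; the doubling
	// and the diagonal a[i]*a[i] terms are folded in later, at
	// Lsqr8x_outer_break/Lsqr4x_shift_n_add:
	//
	//	for (int i = 0; i < 8; i++)            // rows (i)..(vii)
	//		for (int j = i+1; j < 8; j++)
	//			acc[i+j] += (unsigned __int128)a[i]*a[j];
	//	// a^2 == 2*acc + sum of a[i]^2 * 2^(128*i)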

	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+hi(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1	// done yet?
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5	// rewound ap
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,Lsqr8x_outer_break

	mov	x4,x6
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
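	//
	// Editorial sketch of the window pass below (hypothetical names:
	// a_lo[] is the earlier window read back through x4, a_hi[] the
	// eight fresh words in x6-x13): each Lsqr8x_mul iteration adds
	// one a_lo word times the whole window into t[]:
	//
	//	for (int k = 0; k < 8; k++)        // one pass per a_lo[k]
	//		for (int m = 0; m < 8; m++)
	//			acc[k+m] += (unsigned __int128)a_lo[k]*a_hi[m];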
Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	x1,x3		// done yet?
	b.eq	Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldr	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	Lsqr8x_mul

.align	4
Lsqr8x_break:
	ldp	x6,x7,[x0,#8*0]
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	Lsqr8x_outer_loop

.align	4
Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
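	// The doubling is done on the fly with extr, a funnel shift that
	// produces one word of the left-shifted stream. Editorial C
	// equivalent, t1:t0 being adjacent words, least significant first:
	//
	//	doubled_at_t1 = (t1 << 1) | (t0 >> 63); // extr d,t1,t0,#63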
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1
	extr	x15,x16,x15,#63
	sub	x27,x5,#8*4

Lsqr4x_shift_n_add:
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,Lsqr4x_shift_n_add
	ldp	x1,x4,[x29,#104]	// pull np and n0

	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
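	// Editorial sketch of one 64-bit step; Lsqr8x_reduction below
	// runs eight of them per 512-bit window, with
	// n0 == -np[0]^(-1) mod 2^64:
	//
	//	uint64_t m = t[0]*n0;   // mod 2^64; makes t + m*n end in 0
	//	t += m*n;               // low word of t is now 0
	//	t >>= 64;               // drop it; repeat 8 times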
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr		// initial top-most carry
	mov	x2,sp
	mov	x27,#8

Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14
	subs	xzr,x19,#1		// (*)
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,Lsqr8x_reduction

	ldp	x14,x15,[x2,#8*0]
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1	// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,Lsqr8x8_post_condition

	ldr	x4,[x2,#-8*8]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1	// done yet?
	sub	x16,x3,x5	// rewound np
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,Lsqr8x_tail_break

	ldr	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	Lsqr8x_tail

.align	4
Lsqr8x_tail_break:
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	subs	xzr,x30,#1		// "move" top-most carry to carry bit
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr	// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29		// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	Lsqr8x_reduction

	// Final step. We check whether the result is larger than the
	// modulus and, if it is, subtract the modulus. But comparison
	// already implies subtraction, so we subtract the modulus,
	// check whether the subtraction borrowed, and conditionally
	// copy back the original value.
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0		// x0 copy

Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	sub	x27,x5,#8*4
Lsqr4x_cond_copy:
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,Lsqr4x_cond_copy

	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	Lsqr8x_done

.align	4
Lsqr8x8_post_condition:
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19-x26,x28 hold the result, x6-x13 hold the modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr	// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6-x13 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.def __bn_mul4x_mont
   .type 32
.endef
.align	5
__bn_mul4x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
	// only from bn_mul_mont or __bn_sqr8x_mont which have already signed the
	// return address.
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	sub	sp,x26,#8*4		// alloca

	add	x10,x2,x5
	add	x27,x1,x5
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0
	mov	x26,sp

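	// In the 4x loops x28 steps through b[] in byte offsets and wraps
	// modulo 32 ("and x28,x28,#31"), so "ldr x24,[x2,x28]" fetches
	// b[1], b[2], b[3] and finally b[0] again while the loop exits on
	// x28 == 0. Editorial sketch (b_next is a hypothetical name):
	//
	//	x28 = (x28 + 8) & 31;    // 8 -> 16 -> 24 -> 0
	//	b_next = bp[x28/8];      // cbnz x28 ends the window after 4
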
Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_1st_reduction

	cbz	x10,Lmul4x4_post_condition

	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_1st_tail

	sub	x11,x27,x5	// rewound x1
	cbz	x10,Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	Loop_mul4x_1st_tail

.align	5
Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

.align	4
Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align	4
Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_tail

	sub	x11,x3,x5		// rewound np?
	adc	x0,x0,xzr
	cbz	x10,Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	Loop_mul4x_tail

.align	4
Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	Loop_mul4x_reduction

.align	4
Lmul4x_post:
	// Final step. We check whether the result is larger than the
	// modulus and, if it is, subtract the modulus. But comparison
	// already implies subtraction, so we subtract the modulus,
	// check whether the subtraction borrowed, and conditionally
	// copy back the original value.
	mov	x0,x12
	mov	x27,x12		// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	x28,x5,#8*4
Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,Lmul4x_cond_copy

	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	Lmul4x_done

.align	4
Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-x22,x0 hold the result, x14-x17 hold the modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-x9 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
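// The .byte string above decodes to
// "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>"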
.align	2
.align	4
#endif
#endif  // !OPENSSL_NO_ASM
