// NOTE(review): the following lines are code-viewer navigation chrome left over
// from a web scrape, not part of the original source; commented out so the
// file remains assemblable. Original text: Home / Line# / Scopes# / Navigate /
// Raw / Download.
1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#if !defined(__has_feature)
5#define __has_feature(x) 0
6#endif
7#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
8#define OPENSSL_NO_ASM
9#endif
10
11#if !defined(OPENSSL_NO_ASM)
12#if defined(__aarch64__)
13#include "ring_core_generated/prefix_symbols_asm.h"
14#include <ring-core/arm_arch.h>
15
16.text
17
18.globl	bn_mul_mont
19.hidden	bn_mul_mont
20.type	bn_mul_mont,%function
21.align	5
22bn_mul_mont:
23	AARCH64_SIGN_LINK_REGISTER
24	tst	x5,#7
25	b.eq	__bn_sqr8x_mont
26	tst	x5,#3
27	b.eq	__bn_mul4x_mont
28.Lmul_mont:
29	stp	x29,x30,[sp,#-64]!
30	add	x29,sp,#0
31	stp	x19,x20,[sp,#16]
32	stp	x21,x22,[sp,#32]
33	stp	x23,x24,[sp,#48]
34
35	ldr	x9,[x2],#8		// bp[0]
36	sub	x22,sp,x5,lsl#3
37	ldp	x7,x8,[x1],#16	// ap[0..1]
38	lsl	x5,x5,#3
39	ldr	x4,[x4]		// *n0
40	and	x22,x22,#-16		// ABI says so
41	ldp	x13,x14,[x3],#16	// np[0..1]
42
43	mul	x6,x7,x9		// ap[0]*bp[0]
44	sub	x21,x5,#16		// j=num-2
45	umulh	x7,x7,x9
46	mul	x10,x8,x9		// ap[1]*bp[0]
47	umulh	x11,x8,x9
48
49	mul	x15,x6,x4		// "tp[0]"*n0
50	mov	sp,x22			// alloca
51
52	// (*)	mul	x12,x13,x15	// np[0]*m1
53	umulh	x13,x13,x15
54	mul	x16,x14,x15		// np[1]*m1
55	// (*)	adds	x12,x12,x6	// discarded
56	// (*)	As for removal of first multiplication and addition
57	//	instructions. The outcome of first addition is
58	//	guaranteed to be zero, which leaves two computationally
59	//	significant outcomes: it either carries or not. Then
60	//	question is when does it carry? Is there alternative
61	//	way to deduce it? If you follow operations, you can
62	//	observe that condition for carry is quite simple:
63	//	x6 being non-zero. So that carry can be calculated
64	//	by adding -1 to x6. That's what next instruction does.
65	subs	xzr,x6,#1		// (*)
66	umulh	x17,x14,x15
67	adc	x13,x13,xzr
68	cbz	x21,.L1st_skip
69
70.L1st:
71	ldr	x8,[x1],#8
72	adds	x6,x10,x7
73	sub	x21,x21,#8		// j--
74	adc	x7,x11,xzr
75
76	ldr	x14,[x3],#8
77	adds	x12,x16,x13
78	mul	x10,x8,x9		// ap[j]*bp[0]
79	adc	x13,x17,xzr
80	umulh	x11,x8,x9
81
82	adds	x12,x12,x6
83	mul	x16,x14,x15		// np[j]*m1
84	adc	x13,x13,xzr
85	umulh	x17,x14,x15
86	str	x12,[x22],#8		// tp[j-1]
87	cbnz	x21,.L1st
88
89.L1st_skip:
90	adds	x6,x10,x7
91	sub	x1,x1,x5		// rewind x1
92	adc	x7,x11,xzr
93
94	adds	x12,x16,x13
95	sub	x3,x3,x5		// rewind x3
96	adc	x13,x17,xzr
97
98	adds	x12,x12,x6
99	sub	x20,x5,#8		// i=num-1
100	adcs	x13,x13,x7
101
102	adc	x19,xzr,xzr		// upmost overflow bit
103	stp	x12,x13,[x22]
104
105.Louter:
106	ldr	x9,[x2],#8		// bp[i]
107	ldp	x7,x8,[x1],#16
108	ldr	x23,[sp]		// tp[0]
109	add	x22,sp,#8
110
111	mul	x6,x7,x9		// ap[0]*bp[i]
112	sub	x21,x5,#16		// j=num-2
113	umulh	x7,x7,x9
114	ldp	x13,x14,[x3],#16
115	mul	x10,x8,x9		// ap[1]*bp[i]
116	adds	x6,x6,x23
117	umulh	x11,x8,x9
118	adc	x7,x7,xzr
119
120	mul	x15,x6,x4
121	sub	x20,x20,#8		// i--
122
123	// (*)	mul	x12,x13,x15	// np[0]*m1
124	umulh	x13,x13,x15
125	mul	x16,x14,x15		// np[1]*m1
126	// (*)	adds	x12,x12,x6
127	subs	xzr,x6,#1		// (*)
128	umulh	x17,x14,x15
129	cbz	x21,.Linner_skip
130
131.Linner:
132	ldr	x8,[x1],#8
133	adc	x13,x13,xzr
134	ldr	x23,[x22],#8		// tp[j]
135	adds	x6,x10,x7
136	sub	x21,x21,#8		// j--
137	adc	x7,x11,xzr
138
139	adds	x12,x16,x13
140	ldr	x14,[x3],#8
141	adc	x13,x17,xzr
142
143	mul	x10,x8,x9		// ap[j]*bp[i]
144	adds	x6,x6,x23
145	umulh	x11,x8,x9
146	adc	x7,x7,xzr
147
148	mul	x16,x14,x15		// np[j]*m1
149	adds	x12,x12,x6
150	umulh	x17,x14,x15
151	str	x12,[x22,#-16]		// tp[j-1]
152	cbnz	x21,.Linner
153
154.Linner_skip:
155	ldr	x23,[x22],#8		// tp[j]
156	adc	x13,x13,xzr
157	adds	x6,x10,x7
158	sub	x1,x1,x5		// rewind x1
159	adc	x7,x11,xzr
160
161	adds	x12,x16,x13
162	sub	x3,x3,x5		// rewind x3
163	adcs	x13,x17,x19
164	adc	x19,xzr,xzr
165
166	adds	x6,x6,x23
167	adc	x7,x7,xzr
168
169	adds	x12,x12,x6
170	adcs	x13,x13,x7
171	adc	x19,x19,xzr		// upmost overflow bit
172	stp	x12,x13,[x22,#-16]
173
174	cbnz	x20,.Louter
175
176	// Final step. We see if result is larger than modulus, and
177	// if it is, subtract the modulus. But comparison implies
178	// subtraction. So we subtract modulus, see if it borrowed,
179	// and conditionally copy original value.
180	ldr	x23,[sp]		// tp[0]
181	add	x22,sp,#8
182	ldr	x14,[x3],#8		// np[0]
183	subs	x21,x5,#8		// j=num-1 and clear borrow
184	mov	x1,x0
185.Lsub:
186	sbcs	x8,x23,x14		// tp[j]-np[j]
187	ldr	x23,[x22],#8
188	sub	x21,x21,#8		// j--
189	ldr	x14,[x3],#8
190	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
191	cbnz	x21,.Lsub
192
193	sbcs	x8,x23,x14
194	sbcs	x19,x19,xzr		// did it borrow?
195	str	x8,[x1],#8		// rp[num-1]
196
197	ldr	x23,[sp]		// tp[0]
198	add	x22,sp,#8
199	ldr	x8,[x0],#8		// rp[0]
200	sub	x5,x5,#8		// num--
201	nop
202.Lcond_copy:
203	sub	x5,x5,#8		// num--
204	csel	x14,x23,x8,lo		// did it borrow?
205	ldr	x23,[x22],#8
206	ldr	x8,[x0],#8
207	str	xzr,[x22,#-16]		// wipe tp
208	str	x14,[x0,#-16]
209	cbnz	x5,.Lcond_copy
210
211	csel	x14,x23,x8,lo
212	str	xzr,[x22,#-8]		// wipe tp
213	str	x14,[x0,#-8]
214
215	ldp	x19,x20,[x29,#16]
216	mov	sp,x29
217	ldp	x21,x22,[x29,#32]
218	mov	x0,#1
219	ldp	x23,x24,[x29,#48]
220	ldr	x29,[sp],#64
221	AARCH64_VALIDATE_LINK_REGISTER
222	ret
223.size	bn_mul_mont,.-bn_mul_mont
//-----------------------------------------------------------------------
// __bn_sqr8x_mont — Montgomery squaring, 8 words per iteration.
// Reached from bn_mul_mont when num%8==0; requires ap==bp (squaring),
// otherwise it tail-branches to __bn_mul4x_mont. Same argument registers
// as bn_mul_mont. Strategy: compute all cross products a[i]*a[j] (i!=j),
// double the triangle via the extr-based shift-and-add pass, add the
// squares a[i]*a[i], then reduce 512 bits (8 words) per iteration and
// finish with the usual conditional subtraction of the modulus.
// The stack scratch window is wiped before the conditional copy returns.
//-----------------------------------------------------------------------
224.type	__bn_sqr8x_mont,%function
225.align	5
226__bn_sqr8x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
	// only from bn_mul_mont which has already signed the return address.
229	cmp	x1,x2
230	b.ne	__bn_mul4x_mont
231.Lsqr8x_mont:
232	stp	x29,x30,[sp,#-128]!
233	add	x29,sp,#0
234	stp	x19,x20,[sp,#16]
235	stp	x21,x22,[sp,#32]
236	stp	x23,x24,[sp,#48]
237	stp	x25,x26,[sp,#64]
238	stp	x27,x28,[sp,#80]
239	stp	x0,x3,[sp,#96]	// offload rp and np
240
241	ldp	x6,x7,[x1,#8*0]
242	ldp	x8,x9,[x1,#8*2]
243	ldp	x10,x11,[x1,#8*4]
244	ldp	x12,x13,[x1,#8*6]
245
246	sub	x2,sp,x5,lsl#4
247	lsl	x5,x5,#3
248	ldr	x4,[x4]		// *n0
249	mov	sp,x2			// alloca
250	sub	x27,x5,#8*8
251	b	.Lsqr8x_zero_start
252
// Zero the 2*num-word t[] scratch area, 16 words per pass.
253.Lsqr8x_zero:
254	sub	x27,x27,#8*8
255	stp	xzr,xzr,[x2,#8*0]
256	stp	xzr,xzr,[x2,#8*2]
257	stp	xzr,xzr,[x2,#8*4]
258	stp	xzr,xzr,[x2,#8*6]
259.Lsqr8x_zero_start:
260	stp	xzr,xzr,[x2,#8*8]
261	stp	xzr,xzr,[x2,#8*10]
262	stp	xzr,xzr,[x2,#8*12]
263	stp	xzr,xzr,[x2,#8*14]
264	add	x2,x2,#8*16
265	cbnz	x27,.Lsqr8x_zero
266
267	add	x3,x1,x5
268	add	x1,x1,#8*8
269	mov	x19,xzr
270	mov	x20,xzr
271	mov	x21,xzr
272	mov	x22,xzr
273	mov	x23,xzr
274	mov	x24,xzr
275	mov	x25,xzr
276	mov	x26,xzr
277	mov	x2,sp
278	str	x4,[x29,#112]		// offload n0
279
280	// Multiply everything but a[i]*a[i]
281.align	4
282.Lsqr8x_outer_loop:
        //                                                 a[1]a[0]	(i)
        //                                             a[2]a[0]
        //                                         a[3]a[0]
        //                                     a[4]a[0]
        //                                 a[5]a[0]
        //                             a[6]a[0]
        //                         a[7]a[0]
        //                                         a[2]a[1]		(ii)
        //                                     a[3]a[1]
        //                                 a[4]a[1]
        //                             a[5]a[1]
        //                         a[6]a[1]
        //                     a[7]a[1]
        //                                 a[3]a[2]			(iii)
        //                             a[4]a[2]
        //                         a[5]a[2]
        //                     a[6]a[2]
        //                 a[7]a[2]
        //                         a[4]a[3]				(iv)
        //                     a[5]a[3]
        //                 a[6]a[3]
        //             a[7]a[3]
        //                 a[5]a[4]					(v)
        //             a[6]a[4]
        //         a[7]a[4]
        //         a[6]a[5]						(vi)
        //     a[7]a[5]
        // a[7]a[6]							(vii)

312	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
313	mul	x15,x8,x6
314	mul	x16,x9,x6
315	mul	x17,x10,x6
316	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
317	mul	x14,x11,x6
318	adcs	x21,x21,x15
319	mul	x15,x12,x6
320	adcs	x22,x22,x16
321	mul	x16,x13,x6
322	adcs	x23,x23,x17
323	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
324	adcs	x24,x24,x14
325	umulh	x14,x8,x6
326	adcs	x25,x25,x15
327	umulh	x15,x9,x6
328	adcs	x26,x26,x16
329	umulh	x16,x10,x6
330	stp	x19,x20,[x2],#8*2	// t[0..1]
331	adc	x19,xzr,xzr		// t[8]
332	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
333	umulh	x17,x11,x6
334	adcs	x22,x22,x14
335	umulh	x14,x12,x6
336	adcs	x23,x23,x15
337	umulh	x15,x13,x6
338	adcs	x24,x24,x16
339	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
340	adcs	x25,x25,x17
341	mul	x17,x9,x7
342	adcs	x26,x26,x14
343	mul	x14,x10,x7
344	adc	x19,x19,x15
345
346	mul	x15,x11,x7
347	adds	x22,x22,x16
348	mul	x16,x12,x7
349	adcs	x23,x23,x17
350	mul	x17,x13,x7
351	adcs	x24,x24,x14
352	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
353	adcs	x25,x25,x15
354	umulh	x15,x9,x7
355	adcs	x26,x26,x16
356	umulh	x16,x10,x7
357	adcs	x19,x19,x17
358	umulh	x17,x11,x7
359	stp	x21,x22,[x2],#8*2	// t[2..3]
360	adc	x20,xzr,xzr		// t[9]
361	adds	x23,x23,x14
362	umulh	x14,x12,x7
363	adcs	x24,x24,x15
364	umulh	x15,x13,x7
365	adcs	x25,x25,x16
366	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
367	adcs	x26,x26,x17
368	mul	x17,x10,x8
369	adcs	x19,x19,x14
370	mul	x14,x11,x8
371	adc	x20,x20,x15
372
373	mul	x15,x12,x8
374	adds	x24,x24,x16
375	mul	x16,x13,x8
376	adcs	x25,x25,x17
377	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
378	adcs	x26,x26,x14
379	umulh	x14,x10,x8
380	adcs	x19,x19,x15
381	umulh	x15,x11,x8
382	adcs	x20,x20,x16
383	umulh	x16,x12,x8
384	stp	x23,x24,[x2],#8*2	// t[4..5]
385	adc	x21,xzr,xzr		// t[10]
386	adds	x25,x25,x17
387	umulh	x17,x13,x8
388	adcs	x26,x26,x14
389	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
390	adcs	x19,x19,x15
391	mul	x15,x11,x9
392	adcs	x20,x20,x16
393	mul	x16,x12,x9
394	adc	x21,x21,x17
395
396	mul	x17,x13,x9
397	adds	x26,x26,x14
398	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
399	adcs	x19,x19,x15
400	umulh	x15,x11,x9
401	adcs	x20,x20,x16
402	umulh	x16,x12,x9
403	adcs	x21,x21,x17
404	umulh	x17,x13,x9
405	stp	x25,x26,[x2],#8*2	// t[6..7]
406	adc	x22,xzr,xzr		// t[11]
407	adds	x19,x19,x14
408	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
409	adcs	x20,x20,x15
410	mul	x15,x12,x10
411	adcs	x21,x21,x16
412	mul	x16,x13,x10
413	adc	x22,x22,x17
414
415	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
416	adds	x20,x20,x14
417	umulh	x14,x12,x10
418	adcs	x21,x21,x15
419	umulh	x15,x13,x10
420	adcs	x22,x22,x16
421	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
422	adc	x23,xzr,xzr		// t[12]
423	adds	x21,x21,x17
424	mul	x17,x13,x11
425	adcs	x22,x22,x14
426	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
427	adc	x23,x23,x15
428
429	umulh	x15,x13,x11
430	adds	x22,x22,x16
431	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
432	adcs	x23,x23,x17
433	umulh	x17,x13,x12		// hi(a[7]*a[6])
434	adc	x24,xzr,xzr		// t[13]
435	adds	x23,x23,x14
436	sub	x27,x3,x1	// done yet?
437	adc	x24,x24,x15
438
439	adds	x24,x24,x16
440	sub	x14,x3,x5	// rewinded ap
441	adc	x25,xzr,xzr		// t[14]
442	add	x25,x25,x17
443
444	cbz	x27,.Lsqr8x_outer_break
445
446	mov	x4,x6
447	ldp	x6,x7,[x2,#8*0]
448	ldp	x8,x9,[x2,#8*2]
449	ldp	x10,x11,[x2,#8*4]
450	ldp	x12,x13,[x2,#8*6]
451	adds	x19,x19,x6
452	adcs	x20,x20,x7
453	ldp	x6,x7,[x1,#8*0]
454	adcs	x21,x21,x8
455	adcs	x22,x22,x9
456	ldp	x8,x9,[x1,#8*2]
457	adcs	x23,x23,x10
458	adcs	x24,x24,x11
459	ldp	x10,x11,[x1,#8*4]
460	adcs	x25,x25,x12
461	mov	x0,x1
462	adcs	x26,xzr,x13
463	ldp	x12,x13,[x1,#8*6]
464	add	x1,x1,#8*8
465	//adc	x28,xzr,xzr		// moved below
466	mov	x27,#-8*8
467
	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
// Cross-multiply the current 8-word window of a[] against the rest; x27
// counts up from -64 to 0, indexing the next multiplier via [x0,x27].
490.Lsqr8x_mul:
491	mul	x14,x6,x4
492	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
493	mul	x15,x7,x4
494	add	x27,x27,#8
495	mul	x16,x8,x4
496	mul	x17,x9,x4
497	adds	x19,x19,x14
498	mul	x14,x10,x4
499	adcs	x20,x20,x15
500	mul	x15,x11,x4
501	adcs	x21,x21,x16
502	mul	x16,x12,x4
503	adcs	x22,x22,x17
504	mul	x17,x13,x4
505	adcs	x23,x23,x14
506	umulh	x14,x6,x4
507	adcs	x24,x24,x15
508	umulh	x15,x7,x4
509	adcs	x25,x25,x16
510	umulh	x16,x8,x4
511	adcs	x26,x26,x17
512	umulh	x17,x9,x4
513	adc	x28,x28,xzr
514	str	x19,[x2],#8
515	adds	x19,x20,x14
516	umulh	x14,x10,x4
517	adcs	x20,x21,x15
518	umulh	x15,x11,x4
519	adcs	x21,x22,x16
520	umulh	x16,x12,x4
521	adcs	x22,x23,x17
522	umulh	x17,x13,x4
523	ldr	x4,[x0,x27]
524	adcs	x23,x24,x14
525	adcs	x24,x25,x15
526	adcs	x25,x26,x16
527	adcs	x26,x28,x17
528	//adc	x28,xzr,xzr		// moved above
529	cbnz	x27,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
532	cmp	x1,x3		// done yet?
533	b.eq	.Lsqr8x_break

535	ldp	x6,x7,[x2,#8*0]
536	ldp	x8,x9,[x2,#8*2]
537	ldp	x10,x11,[x2,#8*4]
538	ldp	x12,x13,[x2,#8*6]
539	adds	x19,x19,x6
540	ldr	x4,[x0,#-8*8]
541	adcs	x20,x20,x7
542	ldp	x6,x7,[x1,#8*0]
543	adcs	x21,x21,x8
544	adcs	x22,x22,x9
545	ldp	x8,x9,[x1,#8*2]
546	adcs	x23,x23,x10
547	adcs	x24,x24,x11
548	ldp	x10,x11,[x1,#8*4]
549	adcs	x25,x25,x12
550	mov	x27,#-8*8
551	adcs	x26,x26,x13
552	ldp	x12,x13,[x1,#8*6]
553	add	x1,x1,#8*8
554	//adc	x28,xzr,xzr		// moved above
555	b	.Lsqr8x_mul

557.align	4
558.Lsqr8x_break:
559	ldp	x6,x7,[x0,#8*0]
560	add	x1,x0,#8*8
561	ldp	x8,x9,[x0,#8*2]
562	sub	x14,x3,x1		// is it last iteration?
563	ldp	x10,x11,[x0,#8*4]
564	sub	x15,x2,x14
565	ldp	x12,x13,[x0,#8*6]
566	cbz	x14,.Lsqr8x_outer_loop

568	stp	x19,x20,[x2,#8*0]
569	ldp	x19,x20,[x15,#8*0]
570	stp	x21,x22,[x2,#8*2]
571	ldp	x21,x22,[x15,#8*2]
572	stp	x23,x24,[x2,#8*4]
573	ldp	x23,x24,[x15,#8*4]
574	stp	x25,x26,[x2,#8*6]
575	mov	x2,x15
576	ldp	x25,x26,[x15,#8*6]
577	b	.Lsqr8x_outer_loop

579.align	4
580.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
582	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
583	ldp	x15,x16,[sp,#8*1]
584	ldp	x11,x13,[x14,#8*2]
585	add	x1,x14,#8*4
586	ldp	x17,x14,[sp,#8*3]

588	stp	x19,x20,[x2,#8*0]
589	mul	x19,x7,x7
590	stp	x21,x22,[x2,#8*2]
591	umulh	x7,x7,x7
592	stp	x23,x24,[x2,#8*4]
593	mul	x8,x9,x9
594	stp	x25,x26,[x2,#8*6]
595	mov	x2,sp
596	umulh	x9,x9,x9
597	adds	x20,x7,x15,lsl#1
598	extr	x15,x16,x15,#63
599	sub	x27,x5,#8*4

// Shift-and-add pass: doubles the cross-product triangle (extr #63 = 1-bit
// left shift across word boundaries) while adding in the a[i]^2 terms.
601.Lsqr4x_shift_n_add:
602	adcs	x21,x8,x15
603	extr	x16,x17,x16,#63
604	sub	x27,x27,#8*4
605	adcs	x22,x9,x16
606	ldp	x15,x16,[x2,#8*5]
607	mul	x10,x11,x11
608	ldp	x7,x9,[x1],#8*2
609	umulh	x11,x11,x11
610	mul	x12,x13,x13
611	umulh	x13,x13,x13
612	extr	x17,x14,x17,#63
613	stp	x19,x20,[x2,#8*0]
614	adcs	x23,x10,x17
615	extr	x14,x15,x14,#63
616	stp	x21,x22,[x2,#8*2]
617	adcs	x24,x11,x14
618	ldp	x17,x14,[x2,#8*7]
619	extr	x15,x16,x15,#63
620	adcs	x25,x12,x15
621	extr	x16,x17,x16,#63
622	adcs	x26,x13,x16
623	ldp	x15,x16,[x2,#8*9]
624	mul	x6,x7,x7
625	ldp	x11,x13,[x1],#8*2
626	umulh	x7,x7,x7
627	mul	x8,x9,x9
628	umulh	x9,x9,x9
629	stp	x23,x24,[x2,#8*4]
630	extr	x17,x14,x17,#63
631	stp	x25,x26,[x2,#8*6]
632	add	x2,x2,#8*8
633	adcs	x19,x6,x17
634	extr	x14,x15,x14,#63
635	adcs	x20,x7,x14
636	ldp	x17,x14,[x2,#8*3]
637	extr	x15,x16,x15,#63
638	cbnz	x27,.Lsqr4x_shift_n_add
639	ldp	x1,x4,[x29,#104]	// pull np and n0

641	adcs	x21,x8,x15
642	extr	x16,x17,x16,#63
643	adcs	x22,x9,x16
644	ldp	x15,x16,[x2,#8*5]
645	mul	x10,x11,x11
646	umulh	x11,x11,x11
647	stp	x19,x20,[x2,#8*0]
648	mul	x12,x13,x13
649	umulh	x13,x13,x13
650	stp	x21,x22,[x2,#8*2]
651	extr	x17,x14,x17,#63
652	adcs	x23,x10,x17
653	extr	x14,x15,x14,#63
654	ldp	x19,x20,[sp,#8*0]
655	adcs	x24,x11,x14
656	extr	x15,x16,x15,#63
657	ldp	x6,x7,[x1,#8*0]
658	adcs	x25,x12,x15
659	extr	x16,xzr,x16,#63
660	ldp	x8,x9,[x1,#8*2]
661	adc	x26,x13,x16
662	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
665	mul	x28,x4,x19		// t[0]*n0
666	ldp	x12,x13,[x1,#8*6]
667	add	x3,x1,x5
668	ldp	x21,x22,[sp,#8*2]
669	stp	x23,x24,[x2,#8*4]
670	ldp	x23,x24,[sp,#8*4]
671	stp	x25,x26,[x2,#8*6]
672	ldp	x25,x26,[sp,#8*6]
673	add	x1,x1,#8*8
674	mov	x30,xzr		// initial top-most carry
675	mov	x2,sp
676	mov	x27,#8

// Montgomery reduction: fold n[0..7]*(t[0]*n0) into the window, 8 times;
// uses the same subs-xzr carry trick as bn_mul_mont to skip the first mul.
678.Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
680	mul	x15,x7,x28
681	sub	x27,x27,#1
682	mul	x16,x8,x28
683	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
684	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14
686	subs	xzr,x19,#1		// (*)
687	mul	x14,x10,x28
688	adcs	x19,x20,x15
689	mul	x15,x11,x28
690	adcs	x20,x21,x16
691	mul	x16,x12,x28
692	adcs	x21,x22,x17
693	mul	x17,x13,x28
694	adcs	x22,x23,x14
695	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
696	adcs	x23,x24,x15
697	umulh	x15,x7,x28
698	adcs	x24,x25,x16
699	umulh	x16,x8,x28
700	adcs	x25,x26,x17
701	umulh	x17,x9,x28
702	adc	x26,xzr,xzr
703	adds	x19,x19,x14
704	umulh	x14,x10,x28
705	adcs	x20,x20,x15
706	umulh	x15,x11,x28
707	adcs	x21,x21,x16
708	umulh	x16,x12,x28
709	adcs	x22,x22,x17
710	umulh	x17,x13,x28
711	mul	x28,x4,x19		// next t[0]*n0
712	adcs	x23,x23,x14
713	adcs	x24,x24,x15
714	adcs	x25,x25,x16
715	adc	x26,x26,x17
716	cbnz	x27,.Lsqr8x_reduction

718	ldp	x14,x15,[x2,#8*0]
719	ldp	x16,x17,[x2,#8*2]
720	mov	x0,x2
721	sub	x27,x3,x1	// done yet?
722	adds	x19,x19,x14
723	adcs	x20,x20,x15
724	ldp	x14,x15,[x2,#8*4]
725	adcs	x21,x21,x16
726	adcs	x22,x22,x17
727	ldp	x16,x17,[x2,#8*6]
728	adcs	x23,x23,x14
729	adcs	x24,x24,x15
730	adcs	x25,x25,x16
731	adcs	x26,x26,x17
732	//adc	x28,xzr,xzr		// moved below
733	cbz	x27,.Lsqr8x8_post_condition

735	ldr	x4,[x2,#-8*8]
736	ldp	x6,x7,[x1,#8*0]
737	ldp	x8,x9,[x1,#8*2]
738	ldp	x10,x11,[x1,#8*4]
739	mov	x27,#-8*8
740	ldp	x12,x13,[x1,#8*6]
741	add	x1,x1,#8*8

// Tail of the reduction: propagate the stashed t[0]*n0 values across the
// remaining words (same modulo-scheduled shape as .Lsqr8x_mul).
743.Lsqr8x_tail:
744	mul	x14,x6,x4
745	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
746	mul	x15,x7,x4
747	add	x27,x27,#8
748	mul	x16,x8,x4
749	mul	x17,x9,x4
750	adds	x19,x19,x14
751	mul	x14,x10,x4
752	adcs	x20,x20,x15
753	mul	x15,x11,x4
754	adcs	x21,x21,x16
755	mul	x16,x12,x4
756	adcs	x22,x22,x17
757	mul	x17,x13,x4
758	adcs	x23,x23,x14
759	umulh	x14,x6,x4
760	adcs	x24,x24,x15
761	umulh	x15,x7,x4
762	adcs	x25,x25,x16
763	umulh	x16,x8,x4
764	adcs	x26,x26,x17
765	umulh	x17,x9,x4
766	adc	x28,x28,xzr
767	str	x19,[x2],#8
768	adds	x19,x20,x14
769	umulh	x14,x10,x4
770	adcs	x20,x21,x15
771	umulh	x15,x11,x4
772	adcs	x21,x22,x16
773	umulh	x16,x12,x4
774	adcs	x22,x23,x17
775	umulh	x17,x13,x4
776	ldr	x4,[x0,x27]
777	adcs	x23,x24,x14
778	adcs	x24,x25,x15
779	adcs	x25,x26,x16
780	adcs	x26,x28,x17
781	//adc	x28,xzr,xzr		// moved above
782	cbnz	x27,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
785	ldp	x6,x7,[x2,#8*0]
786	sub	x27,x3,x1	// done yet?
787	sub	x16,x3,x5	// rewinded np
788	ldp	x8,x9,[x2,#8*2]
789	ldp	x10,x11,[x2,#8*4]
790	ldp	x12,x13,[x2,#8*6]
791	cbz	x27,.Lsqr8x_tail_break

793	ldr	x4,[x0,#-8*8]
794	adds	x19,x19,x6
795	adcs	x20,x20,x7
796	ldp	x6,x7,[x1,#8*0]
797	adcs	x21,x21,x8
798	adcs	x22,x22,x9
799	ldp	x8,x9,[x1,#8*2]
800	adcs	x23,x23,x10
801	adcs	x24,x24,x11
802	ldp	x10,x11,[x1,#8*4]
803	adcs	x25,x25,x12
804	mov	x27,#-8*8
805	adcs	x26,x26,x13
806	ldp	x12,x13,[x1,#8*6]
807	add	x1,x1,#8*8
808	//adc	x28,xzr,xzr		// moved above
809	b	.Lsqr8x_tail

811.align	4
812.Lsqr8x_tail_break:
813	ldr	x4,[x29,#112]		// pull n0
814	add	x27,x2,#8*8		// end of current t[num] window

816	subs	xzr,x30,#1		// "move" top-most carry to carry bit
817	adcs	x14,x19,x6
818	adcs	x15,x20,x7
819	ldp	x19,x20,[x0,#8*0]
820	adcs	x21,x21,x8
821	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
822	adcs	x22,x22,x9
823	ldp	x8,x9,[x16,#8*2]
824	adcs	x23,x23,x10
825	adcs	x24,x24,x11
826	ldp	x10,x11,[x16,#8*4]
827	adcs	x25,x25,x12
828	adcs	x26,x26,x13
829	ldp	x12,x13,[x16,#8*6]
830	add	x1,x16,#8*8
831	adc	x30,xzr,xzr	// top-most carry
832	mul	x28,x4,x19
833	stp	x14,x15,[x2,#8*0]
834	stp	x21,x22,[x2,#8*2]
835	ldp	x21,x22,[x0,#8*2]
836	stp	x23,x24,[x2,#8*4]
837	ldp	x23,x24,[x0,#8*4]
838	cmp	x27,x29		// did we hit the bottom?
839	stp	x25,x26,[x2,#8*6]
840	mov	x2,x0			// slide the window
841	ldp	x25,x26,[x0,#8*6]
842	mov	x27,#8
843	b.ne	.Lsqr8x_reduction

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
849	ldr	x0,[x29,#96]		// pull rp
850	add	x2,x2,#8*8
851	subs	x14,x19,x6
852	sbcs	x15,x20,x7
853	sub	x27,x5,#8*8
854	mov	x3,x0		// x0 copy

856.Lsqr8x_sub:
857	sbcs	x16,x21,x8
858	ldp	x6,x7,[x1,#8*0]
859	sbcs	x17,x22,x9
860	stp	x14,x15,[x0,#8*0]
861	sbcs	x14,x23,x10
862	ldp	x8,x9,[x1,#8*2]
863	sbcs	x15,x24,x11
864	stp	x16,x17,[x0,#8*2]
865	sbcs	x16,x25,x12
866	ldp	x10,x11,[x1,#8*4]
867	sbcs	x17,x26,x13
868	ldp	x12,x13,[x1,#8*6]
869	add	x1,x1,#8*8
870	ldp	x19,x20,[x2,#8*0]
871	sub	x27,x27,#8*8
872	ldp	x21,x22,[x2,#8*2]
873	ldp	x23,x24,[x2,#8*4]
874	ldp	x25,x26,[x2,#8*6]
875	add	x2,x2,#8*8
876	stp	x14,x15,[x0,#8*4]
877	sbcs	x14,x19,x6
878	stp	x16,x17,[x0,#8*6]
879	add	x0,x0,#8*8
880	sbcs	x15,x20,x7
881	cbnz	x27,.Lsqr8x_sub

883	sbcs	x16,x21,x8
884	mov	x2,sp
885	add	x1,sp,x5
886	ldp	x6,x7,[x3,#8*0]
887	sbcs	x17,x22,x9
888	stp	x14,x15,[x0,#8*0]
889	sbcs	x14,x23,x10
890	ldp	x8,x9,[x3,#8*2]
891	sbcs	x15,x24,x11
892	stp	x16,x17,[x0,#8*2]
893	sbcs	x16,x25,x12
894	ldp	x19,x20,[x1,#8*0]
895	sbcs	x17,x26,x13
896	ldp	x21,x22,[x1,#8*2]
897	sbcs	xzr,x30,xzr	// did it borrow?
898	ldr	x30,[x29,#8]		// pull return address
899	stp	x14,x15,[x0,#8*4]
900	stp	x16,x17,[x0,#8*6]

// Branch-free select (csel ..,lo on the borrow flag) between t[] and the
// subtracted result, wiping the stack scratch as it is consumed.
902	sub	x27,x5,#8*4
903.Lsqr4x_cond_copy:
904	sub	x27,x27,#8*4
905	csel	x14,x19,x6,lo
906	stp	xzr,xzr,[x2,#8*0]
907	csel	x15,x20,x7,lo
908	ldp	x6,x7,[x3,#8*4]
909	ldp	x19,x20,[x1,#8*4]
910	csel	x16,x21,x8,lo
911	stp	xzr,xzr,[x2,#8*2]
912	add	x2,x2,#8*4
913	csel	x17,x22,x9,lo
914	ldp	x8,x9,[x3,#8*6]
915	ldp	x21,x22,[x1,#8*6]
916	add	x1,x1,#8*4
917	stp	x14,x15,[x3,#8*0]
918	stp	x16,x17,[x3,#8*2]
919	add	x3,x3,#8*4
920	stp	xzr,xzr,[x1,#8*0]
921	stp	xzr,xzr,[x1,#8*2]
922	cbnz	x27,.Lsqr4x_cond_copy

924	csel	x14,x19,x6,lo
925	stp	xzr,xzr,[x2,#8*0]
926	csel	x15,x20,x7,lo
927	stp	xzr,xzr,[x2,#8*2]
928	csel	x16,x21,x8,lo
929	csel	x17,x22,x9,lo
930	stp	x14,x15,[x3,#8*0]
931	stp	x16,x17,[x3,#8*2]

933	b	.Lsqr8x_done

935.align	4
// Special case: num==8, so the whole result is still in registers; do the
// conditional subtraction directly and wipe the 16-word scratch.
936.Lsqr8x8_post_condition:
937	adc	x28,xzr,xzr
938	ldr	x30,[x29,#8]		// pull return address
	// x19-7,x28 hold result, x6-7 hold modulus
940	subs	x6,x19,x6
941	ldr	x1,[x29,#96]		// pull rp
942	sbcs	x7,x20,x7
943	stp	xzr,xzr,[sp,#8*0]
944	sbcs	x8,x21,x8
945	stp	xzr,xzr,[sp,#8*2]
946	sbcs	x9,x22,x9
947	stp	xzr,xzr,[sp,#8*4]
948	sbcs	x10,x23,x10
949	stp	xzr,xzr,[sp,#8*6]
950	sbcs	x11,x24,x11
951	stp	xzr,xzr,[sp,#8*8]
952	sbcs	x12,x25,x12
953	stp	xzr,xzr,[sp,#8*10]
954	sbcs	x13,x26,x13
955	stp	xzr,xzr,[sp,#8*12]
956	sbcs	x28,x28,xzr	// did it borrow?
957	stp	xzr,xzr,[sp,#8*14]

	// x6-7 hold result-modulus
960	csel	x6,x19,x6,lo
961	csel	x7,x20,x7,lo
962	csel	x8,x21,x8,lo
963	csel	x9,x22,x9,lo
964	stp	x6,x7,[x1,#8*0]
965	csel	x10,x23,x10,lo
966	csel	x11,x24,x11,lo
967	stp	x8,x9,[x1,#8*2]
968	csel	x12,x25,x12,lo
969	csel	x13,x26,x13,lo
970	stp	x10,x11,[x1,#8*4]
971	stp	x12,x13,[x1,#8*6]

// Epilogue: restore callee-saved regs and the caller's sp; return 1.
973.Lsqr8x_done:
974	ldp	x19,x20,[x29,#16]
975	mov	sp,x29
976	ldp	x21,x22,[x29,#32]
977	mov	x0,#1
978	ldp	x23,x24,[x29,#48]
979	ldp	x25,x26,[x29,#64]
980	ldp	x27,x28,[x29,#80]
981	ldr	x29,[sp],#128
	// x30 is popped earlier
983	AARCH64_VALIDATE_LINK_REGISTER
984	ret
985.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
986.type	__bn_mul4x_mont,%function
987.align	5
988__bn_mul4x_mont:
989	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
990	// only from bn_mul_mont or __bn_mul8x_mont which have already signed the
991	// return address.
992	stp	x29,x30,[sp,#-128]!
993	add	x29,sp,#0
994	stp	x19,x20,[sp,#16]
995	stp	x21,x22,[sp,#32]
996	stp	x23,x24,[sp,#48]
997	stp	x25,x26,[sp,#64]
998	stp	x27,x28,[sp,#80]
999
1000	sub	x26,sp,x5,lsl#3
1001	lsl	x5,x5,#3
1002	ldr	x4,[x4]		// *n0
1003	sub	sp,x26,#8*4		// alloca
1004
1005	add	x10,x2,x5
1006	add	x27,x1,x5
1007	stp	x0,x10,[x29,#96]	// offload rp and &b[num]
1008
1009	ldr	x24,[x2,#8*0]		// b[0]
1010	ldp	x6,x7,[x1,#8*0]	// a[0..3]
1011	ldp	x8,x9,[x1,#8*2]
1012	add	x1,x1,#8*4
1013	mov	x19,xzr
1014	mov	x20,xzr
1015	mov	x21,xzr
1016	mov	x22,xzr
1017	ldp	x14,x15,[x3,#8*0]	// n[0..3]
1018	ldp	x16,x17,[x3,#8*2]
1019	adds	x3,x3,#8*4		// clear carry bit
1020	mov	x0,xzr
1021	mov	x28,#0
1022	mov	x26,sp
1023
1024.Loop_mul4x_1st_reduction:
1025	mul	x10,x6,x24		// lo(a[0..3]*b[0])
1026	adc	x0,x0,xzr	// modulo-scheduled
1027	mul	x11,x7,x24
1028	add	x28,x28,#8
1029	mul	x12,x8,x24
1030	and	x28,x28,#31
1031	mul	x13,x9,x24
1032	adds	x19,x19,x10
1033	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
1034	adcs	x20,x20,x11
1035	mul	x25,x19,x4		// t[0]*n0
1036	adcs	x21,x21,x12
1037	umulh	x11,x7,x24
1038	adcs	x22,x22,x13
1039	umulh	x12,x8,x24
1040	adc	x23,xzr,xzr
1041	umulh	x13,x9,x24
1042	ldr	x24,[x2,x28]		// next b[i] (or b[0])
1043	adds	x20,x20,x10
1044	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
1045	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
1046	adcs	x21,x21,x11
1047	mul	x11,x15,x25
1048	adcs	x22,x22,x12
1049	mul	x12,x16,x25
1050	adc	x23,x23,x13		// can't overflow
1051	mul	x13,x17,x25
1052	// (*)	adds	xzr,x19,x10
1053	subs	xzr,x19,#1		// (*)
1054	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
1055	adcs	x19,x20,x11
1056	umulh	x11,x15,x25
1057	adcs	x20,x21,x12
1058	umulh	x12,x16,x25
1059	adcs	x21,x22,x13
1060	umulh	x13,x17,x25
1061	adcs	x22,x23,x0
1062	adc	x0,xzr,xzr
1063	adds	x19,x19,x10
1064	sub	x10,x27,x1
1065	adcs	x20,x20,x11
1066	adcs	x21,x21,x12
1067	adcs	x22,x22,x13
1068	//adc	x0,x0,xzr
1069	cbnz	x28,.Loop_mul4x_1st_reduction
1070
1071	cbz	x10,.Lmul4x4_post_condition
1072
1073	ldp	x6,x7,[x1,#8*0]	// a[4..7]
1074	ldp	x8,x9,[x1,#8*2]
1075	add	x1,x1,#8*4
1076	ldr	x25,[sp]		// a[0]*n0
1077	ldp	x14,x15,[x3,#8*0]	// n[4..7]
1078	ldp	x16,x17,[x3,#8*2]
1079	add	x3,x3,#8*4
1080
1081.Loop_mul4x_1st_tail:
1082	mul	x10,x6,x24		// lo(a[4..7]*b[i])
1083	adc	x0,x0,xzr	// modulo-scheduled
1084	mul	x11,x7,x24
1085	add	x28,x28,#8
1086	mul	x12,x8,x24
1087	and	x28,x28,#31
1088	mul	x13,x9,x24
1089	adds	x19,x19,x10
1090	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
1091	adcs	x20,x20,x11
1092	umulh	x11,x7,x24
1093	adcs	x21,x21,x12
1094	umulh	x12,x8,x24
1095	adcs	x22,x22,x13
1096	umulh	x13,x9,x24
1097	adc	x23,xzr,xzr
1098	ldr	x24,[x2,x28]		// next b[i] (or b[0])
1099	adds	x20,x20,x10
1100	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
1101	adcs	x21,x21,x11
1102	mul	x11,x15,x25
1103	adcs	x22,x22,x12
1104	mul	x12,x16,x25
1105	adc	x23,x23,x13		// can't overflow
1106	mul	x13,x17,x25
1107	adds	x19,x19,x10
1108	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
1109	adcs	x20,x20,x11
1110	umulh	x11,x15,x25
1111	adcs	x21,x21,x12
1112	umulh	x12,x16,x25
1113	adcs	x22,x22,x13
1114	adcs	x23,x23,x0
1115	umulh	x13,x17,x25
1116	adc	x0,xzr,xzr
1117	ldr	x25,[sp,x28]		// next t[0]*n0
1118	str	x19,[x26],#8		// result!!!
1119	adds	x19,x20,x10
1120	sub	x10,x27,x1		// done yet?
1121	adcs	x20,x21,x11
1122	adcs	x21,x22,x12
1123	adcs	x22,x23,x13
1124	//adc	x0,x0,xzr
1125	cbnz	x28,.Loop_mul4x_1st_tail
1126
1127	sub	x11,x27,x5	// rewinded x1
1128	cbz	x10,.Lmul4x_proceed
1129
1130	ldp	x6,x7,[x1,#8*0]
1131	ldp	x8,x9,[x1,#8*2]
1132	add	x1,x1,#8*4
1133	ldp	x14,x15,[x3,#8*0]
1134	ldp	x16,x17,[x3,#8*2]
1135	add	x3,x3,#8*4
1136	b	.Loop_mul4x_1st_tail
1137
1138.align	5
1139.Lmul4x_proceed:
1140	ldr	x24,[x2,#8*4]!		// *++b
1141	adc	x30,x0,xzr
1142	ldp	x6,x7,[x11,#8*0]	// a[0..3]
1143	sub	x3,x3,x5		// rewind np
1144	ldp	x8,x9,[x11,#8*2]
1145	add	x1,x11,#8*4
1146
1147	stp	x19,x20,[x26,#8*0]	// result!!!
1148	ldp	x19,x20,[sp,#8*4]	// t[0..3]
1149	stp	x21,x22,[x26,#8*2]	// result!!!
1150	ldp	x21,x22,[sp,#8*6]
1151
1152	ldp	x14,x15,[x3,#8*0]	// n[0..3]
1153	mov	x26,sp
1154	ldp	x16,x17,[x3,#8*2]
1155	adds	x3,x3,#8*4		// clear carry bit
1156	mov	x0,xzr
1157
1158.align	4
1159.Loop_mul4x_reduction:
1160	mul	x10,x6,x24		// lo(a[0..3]*b[4])
1161	adc	x0,x0,xzr	// modulo-scheduled
1162	mul	x11,x7,x24
1163	add	x28,x28,#8
1164	mul	x12,x8,x24
1165	and	x28,x28,#31
1166	mul	x13,x9,x24
1167	adds	x19,x19,x10
1168	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
1169	adcs	x20,x20,x11
1170	mul	x25,x19,x4		// t[0]*n0
1171	adcs	x21,x21,x12
1172	umulh	x11,x7,x24
1173	adcs	x22,x22,x13
1174	umulh	x12,x8,x24
1175	adc	x23,xzr,xzr
1176	umulh	x13,x9,x24
1177	ldr	x24,[x2,x28]		// next b[i]
1178	adds	x20,x20,x10
1179	// (*)	mul	x10,x14,x25
1180	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
1181	adcs	x21,x21,x11
1182	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0
1183	adcs	x22,x22,x12
1184	mul	x12,x16,x25
1185	adc	x23,x23,x13		// can't overflow
1186	mul	x13,x17,x25
1187	// (*)	adds	xzr,x19,x10
1188	subs	xzr,x19,#1		// (*)
1189	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0
1190	adcs	x19,x20,x11
1191	umulh	x11,x15,x25
1192	adcs	x20,x21,x12
1193	umulh	x12,x16,x25
1194	adcs	x21,x22,x13
1195	umulh	x13,x17,x25
1196	adcs	x22,x23,x0
1197	adc	x0,xzr,xzr
1198	adds	x19,x19,x10
1199	adcs	x20,x20,x11
1200	adcs	x21,x21,x12
1201	adcs	x22,x22,x13
1202	//adc	x0,x0,xzr
1203	cbnz	x28,.Loop_mul4x_reduction
1204
1205	adc	x0,x0,xzr
1206	ldp	x10,x11,[x26,#8*4]	// t[4..7]
1207	ldp	x12,x13,[x26,#8*6]
1208	ldp	x6,x7,[x1,#8*0]	// a[4..7]
1209	ldp	x8,x9,[x1,#8*2]
1210	add	x1,x1,#8*4
1211	adds	x19,x19,x10
1212	adcs	x20,x20,x11
1213	adcs	x21,x21,x12
1214	adcs	x22,x22,x13
1215	//adc	x0,x0,xzr
1216
1217	ldr	x25,[sp]		// t[0]*n0
1218	ldp	x14,x15,[x3,#8*0]	// n[4..7]
1219	ldp	x16,x17,[x3,#8*2]
1220	add	x3,x3,#8*4
1221
1222.align	4
1223.Loop_mul4x_tail:
1224	mul	x10,x6,x24		// lo(a[4..7]*b[4])
1225	adc	x0,x0,xzr	// modulo-scheduled
1226	mul	x11,x7,x24
1227	add	x28,x28,#8
1228	mul	x12,x8,x24
1229	and	x28,x28,#31
1230	mul	x13,x9,x24
1231	adds	x19,x19,x10
1232	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
1233	adcs	x20,x20,x11
1234	umulh	x11,x7,x24
1235	adcs	x21,x21,x12
1236	umulh	x12,x8,x24
1237	adcs	x22,x22,x13
1238	umulh	x13,x9,x24
1239	adc	x23,xzr,xzr
1240	ldr	x24,[x2,x28]		// next b[i]
1241	adds	x20,x20,x10
1242	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
1243	adcs	x21,x21,x11
1244	mul	x11,x15,x25
1245	adcs	x22,x22,x12
1246	mul	x12,x16,x25
1247	adc	x23,x23,x13		// can't overflow
1248	mul	x13,x17,x25
1249	adds	x19,x19,x10
1250	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
1251	adcs	x20,x20,x11
1252	umulh	x11,x15,x25
1253	adcs	x21,x21,x12
1254	umulh	x12,x16,x25
1255	adcs	x22,x22,x13
1256	umulh	x13,x17,x25
1257	adcs	x23,x23,x0
1258	ldr	x25,[sp,x28]		// next a[0]*n0
1259	adc	x0,xzr,xzr
1260	str	x19,[x26],#8		// result!!!
1261	adds	x19,x20,x10
1262	sub	x10,x27,x1		// done yet?
1263	adcs	x20,x21,x11
1264	adcs	x21,x22,x12
1265	adcs	x22,x23,x13
1266	//adc	x0,x0,xzr
1267	cbnz	x28,.Loop_mul4x_tail
1268
	sub	x11,x3,x5		// rewinded np?
	adc	x0,x0,xzr		// pick up the modulo-scheduled carry
	cbz	x10,.Loop_mul4x_break	// a[] exhausted (x1 reached x27)

	// More a[]/n[] words remain for the current group of b[] words:
	// merge the next four stored temporaries into the accumulators
	// and advance the ap/np pointers by four words each.
	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_tail
1287
.align	4
// All of a[]/n[] consumed for the current group of four b[] words:
// flush t[0..3] to the temporary area, then check (against &b[num]
// pulled from the stashed frame slot) whether more b[] words remain;
// if so, restart the reduction loop with rewound ap/np.
.Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30		// x30 carries the topmost word between groups
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr		// stash topmost carry word for the next group
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	.Lmul4x_post

	// More b[] words: fetch the next b[0] and a[0..3], reset the
	// overflow accumulator and the t[] write pointer, and go again.
	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	.Loop_mul4x_reduction
1315
.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	x0,x12			// x0 = rp, store pointer for t - n
	mov	x27,x12		// x0 copy
	subs	x10,x19,x14		// start t - n on the first two words
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4		// byte counter for the subtract loop
// Compute t - n four words per iteration, storing the difference to
// rp (x0) while the borrow propagates through the sbcs chain; loads
// for the next iteration are interleaved ahead of their use.
.Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,.Lmul4x_sub

	// Flush the last four difference words, then fold the topmost
	// carry word (x30) into the borrow: after this, condition "lo"
	// (carry clear) means the subtraction borrowed, i.e. t < n and
	// the unreduced value must be kept by the copy loop below.
	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	x28,x5,#8*4
// Conditional copy: walk rp (x27, holding t - n) and the stack
// temporaries (x1, holding unreduced t) four words at a time,
// keeping t when the subtraction borrowed ("lo") and t - n
// otherwise.  The stack temporaries are zeroed through x26 as we
// go, so no intermediate values are left behind on the stack.
.Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo		// t word if borrowed, else (t - n) word
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,.Lmul4x_cond_copy

	// Last four result words, plus the final wipe of the stack
	// temporaries.
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]	// NOTE(review): #8*3/#8*4 overlap the
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]	// #8*2 store; wipes 48 bytes total — confirm vs generator
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	.Lmul4x_done
1391
.align	4
// Short-form finish with the whole result still in registers
// (presumably the num == 4 case — entry branch is outside this
// chunk; confirm against the function head).
.Lmul4x4_post_condition:
	adc	x0,x0,xzr		// final carry word into x0
	ldr	x1,[x29,#96]		// pull rp
	// x19-3,x0 hold result, x14-7 hold modulus
	subs	x6,x19,x14		// result - modulus, borrow chains below
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]	// wipe stack temporaries as we go
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-3 hold result-modulus
	csel	x6,x19,x6,lo		// on borrow keep the unreduced value
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]
1415
// Common epilogue: restore callee-saved registers from the frame at
// x29, release the 128-byte frame, and return 1 in x0.  x30 was
// already reloaded on every path leading here.
.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1			// return value
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128		// restore FP, pop the frame
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
// ASCII identification string: "Montgomery Multiplication for ARMv8,
// CRYPTOGAMS by <appro@openssl.org>", NUL-terminated.
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	4
#endif
#endif  // !OPENSSL_NO_ASM
// Empty .note.GNU-stack section: marks the stack non-executable for
// GNU toolchains.
.section	.note.GNU-stack,"",%progbits