// NOTE(review): web-viewer navigation chrome ("Home / Line# / Scopes# / Navigate /
// Raw / Download") removed from the scraped listing; it is not part of the source.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
// MemorySanitizer cannot see into hand-written assembly, so force the C
// fallback (OPENSSL_NO_ASM) when building under MSan.
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

// Assemble this code only for AArch64 targets with assembly enabled.
#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>
.text

// bn_mul_mont(rp=x0, ap=x1, bp=x2, np=x3, n0=x4, num=x5)
//
// Montgomery multiplication of ap[] by bp[] modulo np[], num 64-bit limbs,
// result written to rp[].  x4 points at the Montgomery constant n0 (loaded
// with "ldr x4,[x4]" below).  If num is a multiple of 8 the routine tail-calls
// the 8x squaring/multiplication path, if a multiple of 4 the 4x path;
// otherwise it runs the generic one-limb-per-iteration loop here.
// Returns 1 in x0.  Temporary storage tp[] is carved off the stack (alloca).
// NOTE(review): x4's exact mathematical meaning (presumably -np^-1 mod 2^64)
// is not visible in this file — confirm against the C caller.
.globl	bn_mul_mont
.hidden	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
	tst	x5,#7
	b.eq	__bn_sqr8x_mont
	tst	x5,#3
	b.eq	__bn_mul4x_mont
.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3
	ldp	x7,x8,[x1],#16	// ap[0..1]
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	and	x22,x22,#-16		// ABI says so
	ldp	x13,x14,[x3],#16	// np[0..1]

	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	As for removal of first multiplication and addition
	//	instructions. The outcome of first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or not. Then
	//	question is when does it carry? Is there alternative
	//	way to deduce it? If you follow operations, you can
	//	observe that condition for carry is quite simple:
	//	x6 being non-zero. So that carry can be calculated
	//	by adding -1 to x6. That's what next instruction does.
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr
	cbz	x21,.L1st_skip

.L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,.L1st

.L1st_skip:
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// upmost overflow bit
	stp	x12,x13,[x22]

.Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	cbz	x21,.Linner_skip

.Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	str	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,.Linner

.Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// upmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
.Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,.Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
.Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	str	xzr,[x22,#-16]		// wipe tp
	str	x14,[x0,#-16]
	cbnz	x5,.Lcond_copy

	csel	x14,x23,x8,lo
	str	xzr,[x22,#-8]		// wipe tp
	str	x14,[x0,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	bn_mul_mont,.-bn_mul_mont
// __bn_sqr8x_mont — 8-limb-per-iteration path, reached only from bn_mul_mont
// when num % 8 == 0.  If ap != bp it is not a squaring and control falls
// through to __bn_mul4x_mont; otherwise it computes the square of ap[] and
// performs the Montgomery reduction in 512-bit chunks.  Uses the full
// callee-saved set x19-x28 (128-byte frame) and offloads rp/np/n0 into the
// frame for the reduction and tail phases.
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
	// only from bn_mul_mont which has already signed the return address.
	cmp	x1,x2
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]	// offload rp and np

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8
	b	.Lsqr8x_zero_start

.Lsqr8x_zero:
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,.Lsqr8x_zero

	add	x3,x1,x5
	add	x1,x1,#8*8
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp
	str	x4,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
        //                                                 a[1]a[0]	(i)
        //                                             a[2]a[0]
        //                                         a[3]a[0]
        //                                     a[4]a[0]
        //                                 a[5]a[0]
        //                             a[6]a[0]
        //                         a[7]a[0]
        //                                         a[2]a[1]		(ii)
        //                                     a[3]a[1]
        //                                 a[4]a[1]
        //                             a[5]a[1]
        //                         a[6]a[1]
        //                     a[7]a[1]
        //                                 a[3]a[2]			(iii)
        //                             a[4]a[2]
        //                         a[5]a[2]
        //                     a[6]a[2]
        //                 a[7]a[2]
        //                         a[4]a[3]				(iv)
        //                     a[5]a[3]
        //                 a[6]a[3]
        //             a[7]a[3]
        //                 a[5]a[4]					(v)
        //             a[6]a[4]
        //         a[7]a[4]
        //         a[6]a[5]						(vi)
        //     a[7]a[5]
        // a[7]a[6]							(vii)

	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1	// done yet?
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5	// rewinded ap
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,.Lsqr8x_outer_break

	mov	x4,x6
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
.Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	x1,x3		// done yet?
	b.eq	.Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldr	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	x6,x7,[x0,#8*0]
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,.Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1
	extr	x15,x16,x15,#63
	sub	x27,x5,#8*4

.Lsqr4x_shift_n_add:
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,.Lsqr4x_shift_n_add
	ldp	x1,x4,[x29,#104]	// pull np and n0

	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr		// initial top-most carry
	mov	x2,sp
	mov	x27,#8

.Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14
	subs	xzr,x19,#1		// (*)
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,.Lsqr8x_reduction

	ldp	x14,x15,[x2,#8*0]
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1	// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,.Lsqr8x8_post_condition

	ldr	x4,[x2,#-8*8]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

.Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1	// done yet?
	sub	x16,x3,x5	// rewinded np
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,.Lsqr8x_tail_break

	ldr	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	subs	xzr,x30,#1		// "move" top-most carry to carry bit
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr	// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29		// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0		// x0 copy

.Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,.Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	sub	x27,x5,#8*4
.Lsqr4x_cond_copy:
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,.Lsqr4x_cond_copy

	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	.Lsqr8x_done

.align	4
.Lsqr8x8_post_condition:
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19-7,x28 hold result, x6-7 hold modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr	// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6-7 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
988.type	__bn_mul4x_mont,%function
989.align	5
990__bn_mul4x_mont:
991	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
992	// only from bn_mul_mont or __bn_mul8x_mont which have already signed the
993	// return address.
994	stp	x29,x30,[sp,#-128]!
995	add	x29,sp,#0
996	stp	x19,x20,[sp,#16]
997	stp	x21,x22,[sp,#32]
998	stp	x23,x24,[sp,#48]
999	stp	x25,x26,[sp,#64]
1000	stp	x27,x28,[sp,#80]
1001
1002	sub	x26,sp,x5,lsl#3
1003	lsl	x5,x5,#3
1004	ldr	x4,[x4]		// *n0
1005	sub	sp,x26,#8*4		// alloca
1006
1007	add	x10,x2,x5
1008	add	x27,x1,x5
1009	stp	x0,x10,[x29,#96]	// offload rp and &b[num]
1010
1011	ldr	x24,[x2,#8*0]		// b[0]
1012	ldp	x6,x7,[x1,#8*0]	// a[0..3]
1013	ldp	x8,x9,[x1,#8*2]
1014	add	x1,x1,#8*4
1015	mov	x19,xzr
1016	mov	x20,xzr
1017	mov	x21,xzr
1018	mov	x22,xzr
1019	ldp	x14,x15,[x3,#8*0]	// n[0..3]
1020	ldp	x16,x17,[x3,#8*2]
1021	adds	x3,x3,#8*4		// clear carry bit
1022	mov	x0,xzr
1023	mov	x28,#0
1024	mov	x26,sp
1025
1026.Loop_mul4x_1st_reduction:
1027	mul	x10,x6,x24		// lo(a[0..3]*b[0])
1028	adc	x0,x0,xzr	// modulo-scheduled
1029	mul	x11,x7,x24
1030	add	x28,x28,#8
1031	mul	x12,x8,x24
1032	and	x28,x28,#31
1033	mul	x13,x9,x24
1034	adds	x19,x19,x10
1035	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
1036	adcs	x20,x20,x11
1037	mul	x25,x19,x4		// t[0]*n0
1038	adcs	x21,x21,x12
1039	umulh	x11,x7,x24
1040	adcs	x22,x22,x13
1041	umulh	x12,x8,x24
1042	adc	x23,xzr,xzr
1043	umulh	x13,x9,x24
1044	ldr	x24,[x2,x28]		// next b[i] (or b[0])
1045	adds	x20,x20,x10
1046	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
1047	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
1048	adcs	x21,x21,x11
1049	mul	x11,x15,x25
1050	adcs	x22,x22,x12
1051	mul	x12,x16,x25
1052	adc	x23,x23,x13		// can't overflow
1053	mul	x13,x17,x25
1054	// (*)	adds	xzr,x19,x10
1055	subs	xzr,x19,#1		// (*)
1056	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
1057	adcs	x19,x20,x11
1058	umulh	x11,x15,x25
1059	adcs	x20,x21,x12
1060	umulh	x12,x16,x25
1061	adcs	x21,x22,x13
1062	umulh	x13,x17,x25
1063	adcs	x22,x23,x0
1064	adc	x0,xzr,xzr
1065	adds	x19,x19,x10
1066	sub	x10,x27,x1
1067	adcs	x20,x20,x11
1068	adcs	x21,x21,x12
1069	adcs	x22,x22,x13
1070	//adc	x0,x0,xzr
1071	cbnz	x28,.Loop_mul4x_1st_reduction
1072
1073	cbz	x10,.Lmul4x4_post_condition
1074
1075	ldp	x6,x7,[x1,#8*0]	// a[4..7]
1076	ldp	x8,x9,[x1,#8*2]
1077	add	x1,x1,#8*4
1078	ldr	x25,[sp]		// a[0]*n0
1079	ldp	x14,x15,[x3,#8*0]	// n[4..7]
1080	ldp	x16,x17,[x3,#8*2]
1081	add	x3,x3,#8*4
1082
1083.Loop_mul4x_1st_tail:
1084	mul	x10,x6,x24		// lo(a[4..7]*b[i])
1085	adc	x0,x0,xzr	// modulo-scheduled
1086	mul	x11,x7,x24
1087	add	x28,x28,#8
1088	mul	x12,x8,x24
1089	and	x28,x28,#31
1090	mul	x13,x9,x24
1091	adds	x19,x19,x10
1092	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
1093	adcs	x20,x20,x11
1094	umulh	x11,x7,x24
1095	adcs	x21,x21,x12
1096	umulh	x12,x8,x24
1097	adcs	x22,x22,x13
1098	umulh	x13,x9,x24
1099	adc	x23,xzr,xzr
1100	ldr	x24,[x2,x28]		// next b[i] (or b[0])
1101	adds	x20,x20,x10
1102	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
1103	adcs	x21,x21,x11
1104	mul	x11,x15,x25
1105	adcs	x22,x22,x12
1106	mul	x12,x16,x25
1107	adc	x23,x23,x13		// can't overflow
1108	mul	x13,x17,x25
1109	adds	x19,x19,x10
1110	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
1111	adcs	x20,x20,x11
1112	umulh	x11,x15,x25
1113	adcs	x21,x21,x12
1114	umulh	x12,x16,x25
1115	adcs	x22,x22,x13
1116	adcs	x23,x23,x0
1117	umulh	x13,x17,x25
1118	adc	x0,xzr,xzr
1119	ldr	x25,[sp,x28]		// next t[0]*n0
1120	str	x19,[x26],#8		// result!!!
1121	adds	x19,x20,x10
1122	sub	x10,x27,x1		// done yet?
1123	adcs	x20,x21,x11
1124	adcs	x21,x22,x12
1125	adcs	x22,x23,x13
1126	//adc	x0,x0,xzr
1127	cbnz	x28,.Loop_mul4x_1st_tail
1128
1129	sub	x11,x27,x5	// rewinded x1
1130	cbz	x10,.Lmul4x_proceed
1131
1132	ldp	x6,x7,[x1,#8*0]
1133	ldp	x8,x9,[x1,#8*2]
1134	add	x1,x1,#8*4
1135	ldp	x14,x15,[x3,#8*0]
1136	ldp	x16,x17,[x3,#8*2]
1137	add	x3,x3,#8*4
1138	b	.Loop_mul4x_1st_tail
1139
1140.align	5
1141.Lmul4x_proceed:
1142	ldr	x24,[x2,#8*4]!		// *++b
1143	adc	x30,x0,xzr
1144	ldp	x6,x7,[x11,#8*0]	// a[0..3]
1145	sub	x3,x3,x5		// rewind np
1146	ldp	x8,x9,[x11,#8*2]
1147	add	x1,x11,#8*4
1148
1149	stp	x19,x20,[x26,#8*0]	// result!!!
1150	ldp	x19,x20,[sp,#8*4]	// t[0..3]
1151	stp	x21,x22,[x26,#8*2]	// result!!!
1152	ldp	x21,x22,[sp,#8*6]
1153
1154	ldp	x14,x15,[x3,#8*0]	// n[0..3]
1155	mov	x26,sp
1156	ldp	x16,x17,[x3,#8*2]
1157	adds	x3,x3,#8*4		// clear carry bit
1158	mov	x0,xzr
1159
1160.align	4
1161.Loop_mul4x_reduction:
1162	mul	x10,x6,x24		// lo(a[0..3]*b[4])
1163	adc	x0,x0,xzr	// modulo-scheduled
1164	mul	x11,x7,x24
1165	add	x28,x28,#8
1166	mul	x12,x8,x24
1167	and	x28,x28,#31
1168	mul	x13,x9,x24
1169	adds	x19,x19,x10
1170	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
1171	adcs	x20,x20,x11
1172	mul	x25,x19,x4		// t[0]*n0
1173	adcs	x21,x21,x12
1174	umulh	x11,x7,x24
1175	adcs	x22,x22,x13
1176	umulh	x12,x8,x24
1177	adc	x23,xzr,xzr
1178	umulh	x13,x9,x24
1179	ldr	x24,[x2,x28]		// next b[i]
1180	adds	x20,x20,x10
1181	// (*)	mul	x10,x14,x25
1182	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
1183	adcs	x21,x21,x11
1184	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0
1185	adcs	x22,x22,x12
1186	mul	x12,x16,x25
1187	adc	x23,x23,x13		// can't overflow
1188	mul	x13,x17,x25
1189	// (*)	adds	xzr,x19,x10
1190	subs	xzr,x19,#1		// (*)
1191	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0
1192	adcs	x19,x20,x11
1193	umulh	x11,x15,x25
1194	adcs	x20,x21,x12
1195	umulh	x12,x16,x25
1196	adcs	x21,x22,x13
1197	umulh	x13,x17,x25
1198	adcs	x22,x23,x0
1199	adc	x0,xzr,xzr
1200	adds	x19,x19,x10
1201	adcs	x20,x20,x11
1202	adcs	x21,x21,x12
1203	adcs	x22,x22,x13
1204	//adc	x0,x0,xzr
1205	cbnz	x28,.Loop_mul4x_reduction
1206
1207	adc	x0,x0,xzr
1208	ldp	x10,x11,[x26,#8*4]	// t[4..7]
1209	ldp	x12,x13,[x26,#8*6]
1210	ldp	x6,x7,[x1,#8*0]	// a[4..7]
1211	ldp	x8,x9,[x1,#8*2]
1212	add	x1,x1,#8*4
1213	adds	x19,x19,x10
1214	adcs	x20,x20,x11
1215	adcs	x21,x21,x12
1216	adcs	x22,x22,x13
1217	//adc	x0,x0,xzr
1218
1219	ldr	x25,[sp]		// t[0]*n0
1220	ldp	x14,x15,[x3,#8*0]	// n[4..7]
1221	ldp	x16,x17,[x3,#8*2]
1222	add	x3,x3,#8*4
1223
1224.align	4
1225.Loop_mul4x_tail:
1226	mul	x10,x6,x24		// lo(a[4..7]*b[4])
1227	adc	x0,x0,xzr	// modulo-scheduled
1228	mul	x11,x7,x24
1229	add	x28,x28,#8
1230	mul	x12,x8,x24
1231	and	x28,x28,#31
1232	mul	x13,x9,x24
1233	adds	x19,x19,x10
1234	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
1235	adcs	x20,x20,x11
1236	umulh	x11,x7,x24
1237	adcs	x21,x21,x12
1238	umulh	x12,x8,x24
1239	adcs	x22,x22,x13
1240	umulh	x13,x9,x24
1241	adc	x23,xzr,xzr
1242	ldr	x24,[x2,x28]		// next b[i]
1243	adds	x20,x20,x10
1244	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
1245	adcs	x21,x21,x11
1246	mul	x11,x15,x25
1247	adcs	x22,x22,x12
1248	mul	x12,x16,x25
1249	adc	x23,x23,x13		// can't overflow
1250	mul	x13,x17,x25
1251	adds	x19,x19,x10
1252	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
1253	adcs	x20,x20,x11
1254	umulh	x11,x15,x25
1255	adcs	x21,x21,x12
1256	umulh	x12,x16,x25
1257	adcs	x22,x22,x13
1258	umulh	x13,x17,x25
1259	adcs	x23,x23,x0
1260	ldr	x25,[sp,x28]		// next a[0]*n0
1261	adc	x0,xzr,xzr
1262	str	x19,[x26],#8		// result!!!
1263	adds	x19,x20,x10
1264	sub	x10,x27,x1		// done yet?
1265	adcs	x20,x21,x11
1266	adcs	x21,x22,x12
1267	adcs	x22,x23,x13
1268	//adc	x0,x0,xzr
1269	cbnz	x28,.Loop_mul4x_tail
1270
	sub	x11,x3,x5		// rewinded np? (consumed by .Loop_mul4x_break)
	adc	x0,x0,xzr		// modulo-scheduled carry from the tail loop
	cbz	x10,.Loop_mul4x_break	// x10==0 <=> ap reached &a[num]

	// Not done yet: pull the next t[4..7]/a[4..7]/n[4..7] quadruples
	// (this overwrites the x11 computed above, which is only needed on
	// the break path) and go around the tail loop again.
	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr		// still deferred to the tail-loop top
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_tail
1289
.align	4
.Loop_mul4x_break:
	// End of one full pass over a[]/n[].  Note x30 (LR) is repurposed
	// as a data register holding the top limb carried between passes;
	// the real return address is reloaded from [x29,#8] later, in the
	// post-condition code.
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30		// fold in top-limb carry from the previous pass
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr		// save new top-limb carry for the next pass
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3] (x11 = np rewound by num, set above)
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	.Lmul4x_post

	// More b[] words to process: reload b[i], a[0..3], clear the carry
	// flag and restart the reduction loop with tp back at the bottom
	// of the scratch area.
	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	.Loop_mul4x_reduction
1317
.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	x0,x12			// x0 = rp, output pointer for the subtraction
	mov	x27,x12		// x0 copy
	subs	x10,x19,x14		// start the borrow chain: t[0]-n[0]
	add	x26,sp,#8*8		// x26 -> remaining t[] limbs on the stack
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4		// limb-countdown: (num-4)*8 bytes left
1330
.Lmul4x_sub:
	// Software-pipelined subtraction of the modulus from the result,
	// four limbs per iteration with the borrow chained across
	// iterations; differences are stored to rp as they are produced.
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]	// next n[] pair
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]	// next t[] pair
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]	// store t[i]-n[i]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,.Lmul4x_sub
1347
	// Drain the pipeline: last limbs of the subtraction, then set up
	// for the conditional copy.  x6..x9 will hold the subtracted
	// result (read back from rp via x27); x19..x22 hold the unreduced
	// result still on the stack.
	sbcs	x12,x21,x16
	mov	x26,sp			// x26 = scratch area, wiped during the copy
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	x28,x5,#8*4		// limb countdown for the copy loop
.Lmul4x_cond_copy:
	// If the subtraction borrowed ("lo"), the unreduced value
	// (x19..x22) is selected; otherwise the subtracted one (x6..x9).
	// The on-stack copy is wiped with zeros as we go.
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]	// wipe scratch
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]	// wipe scratch
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]	// write selected limbs back to rp
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,.Lmul4x_cond_copy
1380
	// Last four limbs of the conditional copy, plus the final wipe of
	// the scratch tail.  The #8*3 store overlaps the #8*2 and #8*4
	// pairs; harmless since only zeros are written.  NOTE(review):
	// matches the generator's output — confirm against upstream
	// armv8-mont.pl before changing.
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	.Lmul4x_done
1393
.align	4
.Lmul4x4_post_condition:
	// Post-condition for the case where the whole result fits in
	// registers (label suggests the num==4 path — inferred from the
	// name; the code below uses only x19..x22 plus the carry in x0).
	adc	x0,x0,xzr		// fold in the final top-limb carry
	ldr	x1,[x29,#96]		// pull rp
	// x19-3,x0 hold result, x14-7 hold modulus
	subs	x6,x19,x14		// result - modulus, borrow-chained
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]	// wipe the stack scratch area
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-3 hold result-modulus
	csel	x6,x19,x6,lo		// borrow => keep the unreduced value
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]
1417
.Lmul4x_done:
	// Epilogue: restore callee-saved x19..x28 from the frame anchored
	// at x29, discard the stack scratch, return 1 in x0, and validate
	// the (PAC-signed) link register before returning.
	ldp	x19,x20,[x29,#16]
	mov	sp,x29			// discard the alloca'd scratch area
	ldp	x21,x22,[x29,#32]
	mov	x0,#1			// return value
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128		// pop the 128-byte frame
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
1430.size	__bn_mul4x_mont,.-__bn_mul4x_mont
1431.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1432.align	2
1433.align	4
1434#endif
1435#endif  // !OPENSSL_NO_ASM
1436.section	.note.GNU-stack,"",%progbits
1437