#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

.hidden	OPENSSL_armcap_P
.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	x1,xzr
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]

	ldp	x7,x8,[x1]		// load key
	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48
#ifdef	__ARMEB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
	and	x7,x7,x9		// &=0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// &=0ffffffc0ffffffc
	stp	x7,x8,[x0,#32]	// save key value
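	// The two masks above implement the standard Poly1305 key clamp,
	// r &= 0x0ffffffc0ffffffc_0ffffffc0fffffff: the top four bits of
	// every 32-bit word of r are cleared, as are the low two bits of
	// the upper three words. Clearing the low bits of r1 is what makes
	// the s1 = r1 + (r1>>2) trick used below exact.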

	tst	w17,#ARMV7_NEON

	adr	x12,.Lpoly1305_blocks
	adr	x7,.Lpoly1305_blocks_neon
	adr	x13,.Lpoly1305_emit
	adr	x8,.Lpoly1305_emit_neon

	csel	x12,x12,x7,eq
	csel	x13,x13,x8,eq

#ifdef	__ILP32__
	stp	w12,w13,[x2]
#else
	stp	x12,x13,[x2]
#endif
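	// Runtime dispatch: if the NEON capability bit is clear, the scalar
	// entry points are kept; otherwise the *_neon variants are selected.
	// The chosen pair is written through x2 as two function pointers
	// (32-bit slots under ILP32, 64-bit otherwise).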

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	ands	x2,x2,#-16
	b.eq	.Lno_data

	ldp	x4,x5,[x0]		// load hash value
	ldp	x7,x8,[x0,#32]	// load key value
	ldr	x6,[x0,#16]
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
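	// s1 folds the high-limb product back into range: 2^130 = 5 mod p,
	// and r1 is a multiple of 4 after clamping, so
	// h1*r1*2^128 = h1*(r1/4)*2^130 = h1*5*(r1/4) = h1*(r1 + (r1>>2))
	// mod p, which lets the h1*r1 term land in the low accumulator.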
	b	.Loop

.align	5
.Loop:
	ldp	x10,x11,[x1],#16	// load input
	sub	x2,x2,#16
#ifdef	__ARMEB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// accumulate input
	adcs	x5,x5,x11

	mul	x12,x4,x7		// h0*r0
	adc	x6,x6,x3
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr
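	// The reduction folds bits 130 and up back in: with x14 holding the
	// unreduced third limb, bits >= 2 of x14 weigh 2^130 = 5 mod p, so
	// 5*(x14>>2) = (x14 & ~3) + (x14>>2) is added into the low limbs
	// while only x14 & 3 is kept as the new h2.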

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// store hash value
	str	x6,[x0,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldr	x6,[x0,#16]
	ldp	x10,x11,[x2]	// load nonce

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq
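	// Conditional subtraction of p = 2^130-5: adding 5 overflows into
	// bit 130 (some of bits >= 2 of the third limb set) exactly when
	// h >= p, in which case x12:x13 = h+5 mod 2^130 is the reduced h-p.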

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40
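	// x4:x5:x6 is split into five 26-bit limbs: bits 0-25, 26-51, 52-77
	// (straddling x4/x5, hence extr), 78-103 and 104 up. The last limb
	// is deliberately left unmasked; it may run slightly past 26 bits,
	// which the lazy-reduction NEON code below tolerates.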

	str	w12,[x0,#16*0]	// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]	// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]	// s1
	str	w14,[x0,#16*3]	// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]	// s2
	str	w15,[x0,#16*5]	// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]	// s3
	str	w16,[x0,#16*7]	// r4
	str	w15,[x0,#16*8]	// s4
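	// Each limb is stored 16 bytes apart, i.e. into one 32-bit lane of a
	// 4-lane vector row, so the powers r^1..r^4 written by repeated
	// calls end up side by side. s_i = 5*r_i is precomputed because the
	// base 2^26 multiply folds high products back as h_j*5*r_i.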

	ret
.size	poly1305_splat,.-poly1305_splat

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr	x17,[x0,#24]
	cmp	x2,#128
	b.hs	.Lblocks_neon
	cbz	x17,.Lpoly1305_blocks
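	// x17 is the is_base2_26 flag at ctx+24. NEON is only worth entering
	// for at least 128 bytes of input; shorter inputs fall through to
	// the scalar path unless the hash is already in base 2^26 form.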

.Lblocks_neon:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	x2,x2,#-16
	b.eq	.Lno_data_neon

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon

	ldp	x7,x8,[x0,#32]	// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x14,x6,xzr		// can be partially reduced...

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)

	and	x10,x14,#-4		// ... so reduce
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x4,x10
	adcs	x5,x5,xzr
	adc	x6,x6,xzr
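	// The base 2^26 limbs carry lazy-reduction slack, so the recombined
	// value can spill past bit 130; the same 5*(x14>>2) fold as in the
	// scalar path brings it back into range before the odd 16-byte
	// block (input not a multiple of 32) is processed in base 2^64.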

#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult
	ldr	x30,[sp,#8]

	cbz	x3,.Lstore_base2_64_neon

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cbnz	x2,.Leven_neon

	stp	w10,w11,[x0]		// store hash value base 2^26
	stp	w12,w13,[x0,#8]
	str	w14,[x0,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	x4,x5,[x0]		// store hash value base 2^64
	stp	x6,xzr,[x0,#16]	// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]	// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

.Linit_neon:
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]
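	// Four powers of r are tabulated so two 2-lane vector accumulators
	// can absorb four blocks per iteration. Starting the splat at
	// ctx+48+12 and stepping x0 back by 4 each round places r^1 in
	// lane 3 down to r^4 in lane 0 of each 16-byte row, matching the
	// v0-v8 loads below.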

	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	mov	x4,#1
	stur	x4,[x0,#-24]		// set is_base2_26
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

.Ldo_neon:
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[x16],#48

	lsl	x3,x3,#24
	add	x15,x0,#48

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12
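	// Two 16-byte blocks are split into 26-bit limbs and the matching
	// limbs packed into the low and high 32-bit halves of one 64-bit
	// value (the adds emulate bfi), so each fmov seeds one 2-lane
	// vector with limb i of both blocks. x3 (the pad bit) shifted left
	// by 24 supplies bit 128 of each block on top of the 24-bit top limb.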

	ldp	x8,x12,[x1],#16	// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   ___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   ___________________/ ____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16	// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]
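	// Carry propagation without full normalization: each 64-bit lane is
	// split as c = d >> 26, d &= 2^26-1 (the v31 mask), and c is added
	// to the next limb; the h4 carry re-enters at h0 multiplied by 5
	// (the add plus shl #2). One pass leaves limbs a hair over 26 bits,
	// which is fine because the next multiply's 64-bit lanes have
	// headroom to spare.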
	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s		// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s		// h0 -> h1
	add	v28.2s,v28.2s,v30.2s		// h3 -> h4

	b.hi	.Loop_neon

.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	x2,x2,#32
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	st1	{v23.s}[0],[x0]
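	// Only lane 0 of each accumulator is stored: the hash is left in
	// base 2^26, partially reduced, with is_base2_26 still set;
	// poly1305_emit_neon performs the full carry and final mod-p
	// reduction.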

.Lno_data_neon:
	ldr	x29,[sp],#80
.inst	0xd50323bf		// autiasp
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
	ldr	x17,[x0,#24]
	cbz	x17,poly1305_emit

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x10,x11,[x2]	// load nonce

	and	x12,x6,#-4		// ... so reduce
	add	x12,x12,x6,lsr#2
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2