#ifndef __KERNEL__
# include "arm_arch.h"
.extern	OPENSSL_armcap_P
#endif

.text

// forward "declarations" are required for Apple
.globl	poly1305_blocks
.globl	poly1305_emit

.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	x1,xzr
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

#ifndef	__KERNEL__
	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
#endif

	ldp	x7,x8,[x1]		// load key
	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48
#ifdef	__AARCH64EB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
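	// Clamp r as required by Poly1305: clear the top four bits of each
	// 32-bit word and the low two bits of the upper three words, i.e.
	// r &= 0x0ffffffc0ffffffc0ffffffc0fffffff.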
	and	x7,x7,x9		// &=0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// &=0ffffffc0ffffffc
	mov	w9,#-1
	stp	x7,x8,[x0,#32]	// save key value
	str	w9,[x0,#48]	// impossible key power value

#ifndef	__KERNEL__
	tst	w17,#ARMV7_NEON

	adr	x12,.Lpoly1305_blocks
	adr	x7,.Lpoly1305_blocks_neon
	adr	x13,.Lpoly1305_emit

	csel	x12,x12,x7,eq

# ifdef	__ILP32__
	stp	w12,w13,[x2]
# else
	stp	x12,x13,[x2]
# endif
#endif
	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	ands	x2,x2,#-16
	b.eq	.Lno_data

	ldp	x4,x5,[x0]		// load hash value
	ldp	x6,x17,[x0,#16]	// [along with is_base2_26]
	ldp	x7,x8,[x0,#32]	// load key value

#ifdef	__AARCH64EB__
	lsr	x12,x4,#32
	mov	w13,w4
	lsr	x14,x5,#32
	mov	w15,w5
	lsr	x16,x6,#32
#else
	mov	w12,w4
	lsr	x13,x4,#32
	mov	w14,w5
	lsr	x15,x5,#32
	mov	w16,w6
#endif

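	// Rebuild the 130-bit hash value from the five 26-bit limbs unpacked
	// above: bits 0-63 land in x12, bits 64-127 in x13, bits 128-129 in x14.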
	add	x12,x12,x13,lsl#26	// base 2^26 -> base 2^64
	lsr	x13,x14,#12
	adds	x12,x12,x14,lsl#52
	add	x13,x13,x15,lsl#14
	adc	x13,x13,xzr
	lsr	x14,x16,#24
	adds	x13,x13,x16,lsl#40
	adc	x14,x14,xzr

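	// s1 = r1 + (r1>>2) = 5*r1/4 (the clamp leaves r1's low two bits
	// zero); it absorbs products that wrap past bit 128, e.g.
	// h1*r1*2^128 = h1*(r1/4)*2^130 == h1*s1 (mod 2^130-5).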
	cmp	x17,#0			// is_base2_26?
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
	csel	x4,x4,x12,eq		// choose between radixes
	csel	x5,x5,x13,eq
	csel	x6,x6,x14,eq

.Loop:
	ldp	x10,x11,[x1],#16	// load input
	sub	x2,x2,#16
#ifdef	__AARCH64EB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// accumulate input
	adcs	x5,x5,x11

	mul	x12,x4,x7		// h0*r0
	adc	x6,x6,x3
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

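	// x14 now holds bits 128 and up of the accumulated product: keep its
	// low two bits as the new h2 and fold the rest back into h0 as
	// 5*(x14>>2), i.e. (x14&~3)+(x14>>2), using 2^130 == 5 (mod 2^130-5);
	// the hash stays only partially reduced between blocks.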
	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// store hash value
	stp	x6,xzr,[x0,#16]	// [and clear is_base2_26]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldp	x6,x7,[x0,#16]	// [along with is_base2_26]
	ldp	x10,x11,[x2]	// load nonce

#ifdef	__AARCH64EB__
	lsr	x12,x4,#32
	mov	w13,w4
	lsr	x14,x5,#32
	mov	w15,w5
	lsr	x16,x6,#32
#else
	mov	w12,w4
	lsr	x13,x4,#32
	mov	w14,w5
	lsr	x15,x5,#32
	mov	w16,w6
#endif

	add	x12,x12,x13,lsl#26	// base 2^26 -> base 2^64
	lsr	x13,x14,#12
	adds	x12,x12,x14,lsl#52
	add	x13,x13,x15,lsl#14
	adc	x13,x13,xzr
	lsr	x14,x16,#24
	adds	x13,x13,x16,lsl#40
	adc	x14,x14,xzr

	cmp	x7,#0			// is_base2_26?
	csel	x4,x4,x12,eq		// choose between radixes
	csel	x5,x5,x13,eq
	csel	x6,x6,x14,eq

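	// Final reduction: if h+5 carries into bit 130, then h >= 2^130-5 and
	// h mod p is the low 128 bits of h+5; otherwise h is already reduced.
	// Only 128 bits are kept for the tag.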
	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__AARCH64EB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__AARCH64EB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

.type	poly1305_splat,%function
.align	4
poly1305_splat:
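	// Split the key power in x4:x5:x6 (base 2^64) into five 26-bit limbs
	// and store them, along with 5*limb for limbs 1-4, one 32-bit word
	// every 16 bytes. The 16-byte stride leaves room for the matching
	// limbs of the other three key powers (see .Linit_neon below).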
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40

	str	w12,[x0,#16*0]	// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]	// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]	// s1
	str	w14,[x0,#16*3]	// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]	// s2
	str	w15,[x0,#16*5]	// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]	// s3
	str	w16,[x0,#16*7]	// r4
	str	w15,[x0,#16*8]	// s4

	ret
.size	poly1305_splat,.-poly1305_splat

#ifdef	__KERNEL__
.globl	poly1305_blocks_neon
#endif
.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr	x17,[x0,#24]
	cmp	x2,#128
	b.lo	.Lpoly1305_blocks

	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon

	ldp	x7,x8,[x0,#32]	// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x14,x6,xzr		// can be partially reduced...

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)

	and	x10,x14,#-4		// ... so reduce
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x4,x10
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

#ifdef	__AARCH64EB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	b	.Leven_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]	// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
#ifdef	__AARCH64EB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

.Linit_neon:
	ldr	w17,[x0,#48]		// first table element
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cmp	w17,#-1			// is value impossible?
	b.ne	.Leven_neon

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
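	// r^1 is written at byte offset 48+12, each further power 4 bytes
	// lower, so after the ld1 loads in .Ldo_neon lane 0 of v0-v8 holds
	// the limbs of r^4 and lane 3 those of r^1.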
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon

.align	4
.Leven_neon:
	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

.Ldo_neon:
	ldp	x8,x12,[x1,#32]	// inp[2:3]
	subs	x2,x2,#64
	ldp	x9,x13,[x1,#48]
	add	x16,x1,#96
	adr	x17,.Lzeros

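	// x3 is the padbit; shifted left by 24 it marks bit 128 of each
	// block once added to the fifth 26-bit limb (bits 104-129) below.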
	lsl	x3,x3,#24
	add	x15,x0,#48

#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12

	ldp	x8,x12,[x1],#16	// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38
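	// v31 = 0x03ffffff in each doubleword, the 26-bit limb mask used by
	// the lazy reductions below.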

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   \___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   \___________________/ \____________________/
	//
	// Note that we start with inp[2:3]*r^2, because it doesn't
	// depend on the reduction from the previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
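	// The 5*r_j factors come from the precomputed s_j = 5*r_j table
	// entries: limb products whose positions add up to five or more wrap
	// around by five positions and pick up a factor of 5, because
	// 2^130 == 5 (mod 2^130-5).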

	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	 ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	 ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef	__AARCH64EB__
	 rev	x8,x8
	 rev	x12,x12
	 rev	x9,x9
	 rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	 and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	 and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	 ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	 ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	 add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	 extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	 extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	 add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	 fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	 and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	 and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	 ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	 ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	 add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	 fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	 add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	 add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	 add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	 add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	 fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	 fmov	d17,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	 fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	 ldp	x8,x12,[x1],#16	// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	 ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef	__AARCH64EB__
	 rev	x8,x8
	 rev	x12,x12
	 rev	x9,x9
	 rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	 and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	 and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	 ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	 ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	 add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	 extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	 extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	 add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	 fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	 and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	 and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	 ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	 ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	 add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	 fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	 add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	 add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	 add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	 add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	 fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	 fmov	d12,x10
	 fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]
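	//
	// Each limb is brought back below 2^26 (plus a small excess); the
	// carry leaving h4 re-enters h0 multiplied by 5 (the add, shl #2,
	// add sequence), again because 2^130 == 5 (mod 2^130-5).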

	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	 ushr	v30.2d,v19.2d,#26
	 and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	 add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	 ushr	v30.2d,v20.2d,#26
	 xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	 add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	 shrn	v30.2s,v21.2d,#26
	 xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	 bic	v25.2s,#0xfc,lsl#24
	 add	v27.2s,v27.2s,v30.2s		// h2 -> h3
	 bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	 ushr	v30.2s,v27.2s,#26
	 bic	v27.2s,#0xfc,lsl#24
	 bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s		// h0 -> h1
	 add	v28.2s,v28.2s,v30.2s		// h3 -> h4

	b.hi	.Loop_neon

.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	x2,x2,#32
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	 ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	 ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	 ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	 ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d
	 ldr	x30,[sp,#8]

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	 ushr	v30.2d,v19.2d,#26
	 and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	 add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	 ushr	v30.2d,v20.2d,#26
	 and	v20.16b,v20.16b,v31.16b
	 add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	 ushr	v30.2d,v21.2d,#26
	 and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	 add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	 ushr	v30.2d,v22.2d,#26
	 and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	 add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	mov	x4,#1
	st1	{v23.s}[0],[x0]
	str	x4,[x0,#8]		// set is_base2_26

	ldr	x29,[sp],#80
	 .inst	0xd50323bf		// autiasp
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.asciz	"Poly1305 for ARMv8, CRYPTOGAMS by @dot-asm"
.align	2
#if !defined(__KERNEL__) && !defined(_WIN64)
.comm	OPENSSL_armcap_P,4,4
.hidden	OPENSSL_armcap_P
#endif