#include "arm_arch.h"

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.text

.globl	poly1305_emit
.globl	poly1305_blocks
.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
.Lpoly1305_init:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

	eor	r3,r3,r3
	cmp	r1,#0
	str	r3,[r0,#0]		@ zero hash value
	str	r3,[r0,#4]
	str	r3,[r0,#8]
	str	r3,[r0,#12]
	str	r3,[r0,#16]
	str	r3,[r0,#36]		@ is_base2_26
	add	r0,r0,#20

#ifdef	__thumb2__
	it	eq
#endif
	moveq	r0,#0
	beq	.Lno_key

#if	__ARM_MAX_ARCH__>=7
	adr	r11,.Lpoly1305_init
	ldr	r12,.LOPENSSL_armcap
#endif
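	@ load the 16-byte r portion of the key and clamp it
	@ (r0 &= 0x0fffffff, r1-r3 &= 0x0ffffffc), as the Poly1305 spec requires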
	ldrb	r4,[r1,#0]
	mov	r10,#0x0fffffff
	ldrb	r5,[r1,#1]
	and	r3,r10,#-4		@ 0x0ffffffc
	ldrb	r6,[r1,#2]
	ldrb	r7,[r1,#3]
	orr	r4,r4,r5,lsl#8
	ldrb	r5,[r1,#4]
	orr	r4,r4,r6,lsl#16
	ldrb	r6,[r1,#5]
	orr	r4,r4,r7,lsl#24
	ldrb	r7,[r1,#6]
	and	r4,r4,r10

#if	__ARM_MAX_ARCH__>=7
# if !defined(_WIN32)
	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r12,[r12]
# endif
#endif
	ldrb	r8,[r1,#7]
	orr	r5,r5,r6,lsl#8
	ldrb	r6,[r1,#8]
	orr	r5,r5,r7,lsl#16
	ldrb	r7,[r1,#9]
	orr	r5,r5,r8,lsl#24
	ldrb	r8,[r1,#10]
	and	r5,r5,r3

#if	__ARM_MAX_ARCH__>=7
	tst	r12,#ARMV7_NEON		@ check for NEON
# ifdef	__thumb2__
	adr	r9,.Lpoly1305_blocks_neon
	adr	r11,.Lpoly1305_blocks
	adr	r12,.Lpoly1305_emit
	adr	r10,.Lpoly1305_emit_neon
	itt	ne
	movne	r11,r9
	movne	r12,r10
	orr	r11,r11,#1	@ thumb-ify address
	orr	r12,r12,#1
# else
	addeq	r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
	addne	r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init)
	addeq	r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
	addne	r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
#endif
	ldrb	r9,[r1,#11]
	orr	r6,r6,r7,lsl#8
	ldrb	r7,[r1,#12]
	orr	r6,r6,r8,lsl#16
	ldrb	r8,[r1,#13]
	orr	r6,r6,r9,lsl#24
	ldrb	r9,[r1,#14]
	and	r6,r6,r3

	ldrb	r10,[r1,#15]
	orr	r7,r7,r8,lsl#8
	str	r4,[r0,#0]
	orr	r7,r7,r9,lsl#16
	str	r5,[r0,#4]
	orr	r7,r7,r10,lsl#24
	str	r6,[r0,#8]
	and	r7,r7,r3
	str	r7,[r0,#12]
#if	__ARM_MAX_ARCH__>=7
	stmia	r2,{r11,r12}		@ fill functions table
	mov	r0,#1
#else
	mov	r0,#0
#endif
.Lno_key:
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if	__ARM_ARCH__>=5
	bx	lr				@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_init,.-poly1305_init
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	stmdb	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}

	ands	r2,r2,#-16
	beq	.Lno_data

	cmp	r3,#0
	add	r2,r2,r1		@ end pointer
	sub	sp,sp,#32

	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11,r12}		@ load context

	str	r0,[sp,#12]		@ offload stuff
	mov	lr,r1
	str	r2,[sp,#16]
	str	r10,[sp,#20]
	str	r11,[sp,#24]
	str	r12,[sp,#28]
	b	.Loop
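
	@ each iteration absorbs one 16-byte block: h += block (plus the
	@ 2^128 padbit when r3 is non-zero), then h = h*r mod 2^130-5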
.Loop:
#if __ARM_ARCH__<7
	ldrb	r0,[lr],#16		@ load input
# ifdef	__thumb2__
	it	hi
# endif
	addhi	r8,r8,#1		@ 1<<128
	ldrb	r1,[lr,#-15]
	ldrb	r2,[lr,#-14]
	ldrb	r3,[lr,#-13]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-12]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-11]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-10]
	adds	r4,r4,r3		@ accumulate input

	ldrb	r3,[lr,#-9]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-8]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-7]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-6]
	adcs	r5,r5,r3

	ldrb	r3,[lr,#-5]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-4]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-3]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-2]
	adcs	r6,r6,r3

	ldrb	r3,[lr,#-1]
	orr	r1,r0,r1,lsl#8
	str	lr,[sp,#8]		@ offload input pointer
	orr	r2,r1,r2,lsl#16
	add	r10,r10,r10,lsr#2
	orr	r3,r2,r3,lsl#24
#else
	ldr	r0,[lr],#16		@ load input
# ifdef	__thumb2__
	it	hi
# endif
	addhi	r8,r8,#1		@ padbit
	ldr	r1,[lr,#-12]
	ldr	r2,[lr,#-8]
	ldr	r3,[lr,#-4]
# ifdef	__ARMEB__
	rev	r0,r0
	rev	r1,r1
	rev	r2,r2
	rev	r3,r3
# endif
	adds	r4,r4,r0		@ accumulate input
	str	lr,[sp,#8]		@ offload input pointer
	adcs	r5,r5,r1
	add	r10,r10,r10,lsr#2
	adcs	r6,r6,r2
#endif
	add	r11,r11,r11,lsr#2
	adcs	r7,r7,r3
	add	r12,r12,r12,lsr#2

	umull	r2,r3,r5,r9
	adc	r8,r8,#0
	umull	r0,r1,r4,r9
	umlal	r2,r3,r8,r10
	umlal	r0,r1,r7,r10
	ldr	r10,[sp,#20]		@ reload r10
	umlal	r2,r3,r6,r12
	umlal	r0,r1,r5,r12
	umlal	r2,r3,r7,r11
	umlal	r0,r1,r6,r11
	umlal	r2,r3,r4,r10
	str	r0,[sp,#0]		@ future r4
	mul	r0,r11,r8
	ldr	r11,[sp,#24]		@ reload r11
	adds	r2,r2,r1		@ d1+=d0>>32
	eor	r1,r1,r1
	adc	lr,r3,#0		@ future r6
	str	r2,[sp,#4]		@ future r5

	mul	r2,r12,r8
	eor	r3,r3,r3
	umlal	r0,r1,r7,r12
	ldr	r12,[sp,#28]		@ reload r12
	umlal	r2,r3,r7,r9
	umlal	r0,r1,r6,r9
	umlal	r2,r3,r6,r10
	umlal	r0,r1,r5,r10
	umlal	r2,r3,r5,r11
	umlal	r0,r1,r4,r11
	umlal	r2,r3,r4,r12
	ldr	r4,[sp,#0]
	mul	r8,r9,r8
	ldr	r5,[sp,#4]

	adds	r6,lr,r0		@ d2+=d1>>32
	ldr	lr,[sp,#8]		@ reload input pointer
	adc	r1,r1,#0
	adds	r7,r2,r1		@ d3+=d2>>32
	ldr	r0,[sp,#16]		@ reload end pointer
	adc	r3,r3,#0
	add	r8,r8,r3		@ h4+=d3>>32
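
	@ partial reduction: 2^130 = 5 (mod 2^130-5), so the bits of h4
	@ above the low two are folded back into h0 multiplied by 5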
	and	r1,r8,#-4
	and	r8,r8,#3
	add	r1,r1,r1,lsr#2		@ *=5
	adds	r4,r4,r1
	adcs	r5,r5,#0
	adcs	r6,r6,#0
	adcs	r7,r7,#0
	adc	r8,r8,#0

	cmp	r0,lr			@ done yet?
	bhi	.Loop

	ldr	r0,[sp,#12]
	add	sp,sp,#32
	stmia	r0,{r4,r5,r6,r7,r8}		@ store the result

.Lno_data:
#if	__ARM_ARCH__>=5
	ldmia	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_blocks,.-poly1305_blocks
.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
.Lpoly1305_emit_enter:
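
	@ final reduction: add 5 and test bit 130 (bit 2 of h4) to decide
	@ whether h or h+5-2^130 (i.e. h-p, p = 2^130-5) is the answer,
	@ then add the 128-bit nonce modulo 2^128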
	ldmia	r0,{r3,r4,r5,r6,r7}
	adds	r8,r3,#5		@ compare to modulus
	adcs	r9,r4,#0
	adcs	r10,r5,#0
	adcs	r11,r6,#0
	adc	r7,r7,#0
	tst	r7,#4			@ did it carry/borrow?

#ifdef	__thumb2__
	it	ne
#endif
	movne	r3,r8
	ldr	r8,[r2,#0]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r4,r9
	ldr	r9,[r2,#4]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r5,r10
	ldr	r10,[r2,#8]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r6,r11
	ldr	r11,[r2,#12]

	adds	r3,r3,r8
	adcs	r4,r4,r9
	adcs	r5,r5,r10
	adc	r6,r6,r11

#if __ARM_ARCH__>=7
# ifdef __ARMEB__
	rev	r3,r3
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
# endif
	str	r3,[r1,#0]
	str	r4,[r1,#4]
	str	r5,[r1,#8]
	str	r6,[r1,#12]
#else
	strb	r3,[r1,#0]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#4]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#8]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#12]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#1]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#5]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#9]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#13]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#2]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#6]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#10]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#14]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#3]
	strb	r4,[r1,#7]
	strb	r5,[r1,#11]
	strb	r6,[r1,#15]
#endif
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if	__ARM_ARCH__>=5
	bx	lr				@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_emit,.-poly1305_emit
#if	__ARM_MAX_ARCH__>=7
.fpu	neon

.type	poly1305_init_neon,%function
.align	5
poly1305_init_neon:
	ldr	r4,[r0,#20]		@ load key base 2^32
	ldr	r5,[r0,#24]
	ldr	r6,[r0,#28]
	ldr	r7,[r0,#32]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	and	r3,r3,#0x03ffffff
	and	r4,r4,#0x03ffffff
	and	r5,r5,#0x03ffffff

	vdup.32	d0,r2			@ r^1 in both lanes
	add	r2,r3,r3,lsl#2		@ *5
	vdup.32	d1,r3
	add	r3,r4,r4,lsl#2
	vdup.32	d2,r2
	vdup.32	d3,r4
	add	r4,r5,r5,lsl#2
	vdup.32	d4,r3
	vdup.32	d5,r5
	add	r5,r6,r6,lsl#2
	vdup.32	d6,r4
	vdup.32	d7,r6
	vdup.32	d8,r5

	mov	r5,#2		@ counter
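
	@ compute r^2, r^3 and r^4 from r; the powers are stored in the
	@ context and later used by poly1305_blocks_neon to process up
	@ to four blocks per iteration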
.Lsquare_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4

	vmull.u32	q5,d0,d0[1]
	vmull.u32	q6,d1,d0[1]
	vmull.u32	q7,d3,d0[1]
	vmull.u32	q8,d5,d0[1]
	vmull.u32	q9,d7,d0[1]

	vmlal.u32	q5,d7,d2[1]
	vmlal.u32	q6,d0,d1[1]
	vmlal.u32	q7,d1,d1[1]
	vmlal.u32	q8,d3,d1[1]
	vmlal.u32	q9,d5,d1[1]

	vmlal.u32	q5,d5,d4[1]
	vmlal.u32	q6,d7,d4[1]
	vmlal.u32	q8,d1,d3[1]
	vmlal.u32	q7,d0,d3[1]
	vmlal.u32	q9,d3,d3[1]

	vmlal.u32	q5,d3,d6[1]
	vmlal.u32	q8,d0,d5[1]
	vmlal.u32	q6,d5,d6[1]
	vmlal.u32	q7,d7,d6[1]
	vmlal.u32	q9,d1,d5[1]

	vmlal.u32	q8,d7,d8[1]
	vmlal.u32	q5,d1,d8[1]
	vmlal.u32	q6,d3,d8[1]
	vmlal.u32	q7,d5,d8[1]
	vmlal.u32	q9,d0,d7[1]
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	@ and P. Schwabe
	@
	@ H0>>+H1>>+H2>>+H3>>+H4
	@ H3>>+H4>>*5+H0>>+H1
	@
	@ Trivia.
	@
	@ The result of multiplying an n-bit number by an m-bit number is
	@ n+m bits wide. However! Even though 2^n is an n+1-bit number,
	@ an m-bit number multiplied by 2^n is still n+m bits wide.
	@
	@ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
	@ and so is sum of four. Sum of 2^m n-m-bit numbers and an n-bit
	@ one is n+1 bits wide.
	@
	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
	@ can be 27. However! In cases when their width exceeds 26 bits
	@ they are limited by 2^26+2^6. This in turn means that the *sum*
	@ of the products with these values can still be viewed as a sum
	@ of 52-bit numbers as long as the number of addends is not a
	@ power of 2. For example,
	@
	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0*R4,
	@
	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
	@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
	@ 8 * (2^52) or 2^55. However, the value is then multiplied
	@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
	@ which is less than 32 * (2^52) or 2^57. And when processing
	@ data we are looking at triple as many addends...
	@
	@ In the key setup procedure pre-reduced H0 is limited by 5*4+1
	@ and 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing
	@ the input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
	@ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? The vmlal.u32
	@ instruction accepts 2x32-bit input and writes a 2x64-bit result.
	@ This means that the result of reduction has to be compressed upon
	@ loop wrap-around. This can be done in the process of reduction
	@ to minimize the number of instructions [as well as the number of
	@ 128-bit instructions, which benefits low-end processors], but
	@ one has to watch for H2 (which is narrower than H0) and 5*H4
	@ not being wider than 58 bits, so that the result of the right
	@ shift by 26 bits fits in 32 bits. This is also useful on x86,
	@ because it allows using paddd in place of paddq, which benefits
	@ Atom, where paddq is ridiculously slow.

	vshr.u64	q15,q8,#26
	vmovn.i64	d16,q8
	vshr.u64	q4,q5,#26
	vmovn.i64	d10,q5
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vbic.i32	d16,#0xfc000000	@ &=0x03ffffff
	vadd.i64	q6,q6,q4		@ h0 -> h1
	vbic.i32	d10,#0xfc000000

	vshrn.u64	d30,q9,#26
	vmovn.i64	d18,q9
	vshr.u64	q4,q6,#26
	vmovn.i64	d12,q6
	vadd.i64	q7,q7,q4		@ h1 -> h2
	vbic.i32	d18,#0xfc000000
	vbic.i32	d12,#0xfc000000

	vadd.i32	d10,d10,d30
	vshl.u32	d30,d30,#2
	vshrn.u64	d8,q7,#26
	vmovn.i64	d14,q7
	vadd.i32	d10,d10,d30	@ h4 -> h0
	vadd.i32	d16,d16,d8	@ h2 -> h3
	vbic.i32	d14,#0xfc000000

	vshr.u32	d30,d10,#26
	vbic.i32	d10,#0xfc000000
	vshr.u32	d8,d16,#26
	vbic.i32	d16,#0xfc000000
	vadd.i32	d12,d12,d30	@ h0 -> h1
	vadd.i32	d18,d18,d8	@ h3 -> h4

	subs	r5,r5,#1
	beq	.Lsquare_break_neon

	add	r6,r0,#(48+0*9*4)
	add	r7,r0,#(48+1*9*4)

	vtrn.32	d0,d10		@ r^2:r^1
	vtrn.32	d3,d14
	vtrn.32	d5,d16
	vtrn.32	d1,d12
	vtrn.32	d7,d18

	vshl.u32	d4,d3,#2		@ *5
	vshl.u32	d6,d5,#2
	vshl.u32	d2,d1,#2
	vshl.u32	d8,d7,#2
	vadd.i32	d4,d4,d3
	vadd.i32	d2,d2,d1
	vadd.i32	d6,d6,d5
	vadd.i32	d8,d8,d7

	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vst1.32	{d8[0]},[r6,:32]
	vst1.32	{d8[1]},[r7,:32]

	b	.Lsquare_neon

.align	4
.Lsquare_break_neon:
	add	r6,r0,#(48+2*4*9)
	add	r7,r0,#(48+3*4*9)

	vmov	d0,d10		@ r^4:r^3
	vshl.u32	d2,d12,#2		@ *5
	vmov	d1,d12
	vshl.u32	d4,d14,#2
	vmov	d3,d14
	vshl.u32	d6,d16,#2
	vmov	d5,d16
	vshl.u32	d8,d18,#2
	vmov	d7,d18
	vadd.i32	d2,d2,d12
	vadd.i32	d4,d4,d14
	vadd.i32	d6,d6,d16
	vadd.i32	d8,d8,d18

	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vst1.32	{d8[0]},[r6]
	vst1.32	{d8[1]},[r7]

	bx	lr				@ bx	lr
.size	poly1305_init_neon,.-poly1305_init_neon

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr	ip,[r0,#36]		@ is_base2_26
	ands	r2,r2,#-16
	beq	.Lno_data_neon
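
	@ inputs shorter than 64 bytes take the scalar code unless the
	@ hash is already in base 2^26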
	cmp	r2,#64
	bhs	.Lenter_neon
	tst	ip,ip			@ is_base2_26?
	beq	.Lpoly1305_blocks

.Lenter_neon:
	stmdb	sp!,{r4,r5,r6,r7}
	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ ABI specification says so

	tst	ip,ip			@ is_base2_26?
	bne	.Lbase2_26_neon

	stmdb	sp!,{r1,r2,r3,lr}
	bl	poly1305_init_neon

	ldr	r4,[r0,#0]		@ load hash value base 2^32
	ldr	r5,[r0,#4]
	ldr	r6,[r0,#8]
	ldr	r7,[r0,#12]
	ldr	ip,[r0,#16]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	veor	d10,d10,d10
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	veor	d12,d12,d12
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	veor	d14,d14,d14
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	veor	d16,d16,d16
	and	r3,r3,#0x03ffffff
	orr	r6,r6,ip,lsl#24
	veor	d18,d18,d18
	and	r4,r4,#0x03ffffff
	mov	r1,#1
	and	r5,r5,#0x03ffffff
	str	r1,[r0,#36]		@ is_base2_26

	vmov.32	d10[0],r2
	vmov.32	d12[0],r3
	vmov.32	d14[0],r4
	vmov.32	d16[0],r5
	vmov.32	d18[0],r6
	adr	r5,.Lzeros

	ldmia	sp!,{r1,r2,r3,lr}
	b	.Lbase2_32_neon

.align	4
.Lbase2_26_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ load hash value

	veor	d10,d10,d10
	veor	d12,d12,d12
	veor	d14,d14,d14
	veor	d16,d16,d16
	veor	d18,d18,d18
	vld4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
	adr	r5,.Lzeros
	vld1.32	{d18[0]},[r0]
	sub	r0,r0,#16		@ rewind

.Lbase2_32_neon:
	add	r4,r1,#32
	mov	r3,r3,lsl#24
	tst	r2,#31
	beq	.Leven

	vld4.32	{d20[0],d22[0],d24[0],d26[0]},[r1]!
	vmov.32	d28[0],r3
	sub	r2,r2,#16
	add	r4,r1,#32

# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q13,q13
	vrev32.8	q11,q11
	vrev32.8	q12,q12
# endif
	vsri.u32	d28,d26,#8	@ base 2^32 -> base 2^26
	vshl.u32	d26,d26,#18

	vsri.u32	d26,d24,#14
	vshl.u32	d24,d24,#12
	vadd.i32	d29,d28,d18	@ add hash value and move to #hi

	vbic.i32	d26,#0xfc000000
	vsri.u32	d24,d22,#20
	vshl.u32	d22,d22,#6

	vbic.i32	d24,#0xfc000000
	vsri.u32	d22,d20,#26
	vadd.i32	d27,d26,d16

	vbic.i32	d20,#0xfc000000
	vbic.i32	d22,#0xfc000000
	vadd.i32	d25,d24,d14

	vadd.i32	d21,d20,d10
	vadd.i32	d23,d22,d12

	mov	r7,r5
	add	r6,r0,#48

	cmp	r2,r2
	b	.Long_tail

.align	4
.Leven:
	subs	r2,r2,#64
	it	lo
	movlo	r4,r5

	vmov.i32	q14,#1<<24		@ padbit, yes, always
	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
	add	r1,r1,#64
	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
	add	r4,r4,#64
	itt	hi
	addhi	r7,r0,#(48+1*9*4)
	addhi	r6,r0,#(48+3*9*4)

# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q13,q13
	vrev32.8	q11,q11
	vrev32.8	q12,q12
# endif
	vsri.u32	q14,q13,#8		@ base 2^32 -> base 2^26
	vshl.u32	q13,q13,#18

	vsri.u32	q13,q12,#14
	vshl.u32	q12,q12,#12

	vbic.i32	q13,#0xfc000000
	vsri.u32	q12,q11,#20
	vshl.u32	q11,q11,#6

	vbic.i32	q12,#0xfc000000
	vsri.u32	q11,q10,#26

	vbic.i32	q10,#0xfc000000
	vbic.i32	q11,#0xfc000000

	bls	.Lskip_loop

	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^2
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	b	.Loop_neon

.align	5
.Loop_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	@   ___________________/
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	@   ___________________/ ____________________/
	@
	@ Note that we start with inp[2:3]*r^2. This is because it
	@ doesn't depend on reduction in previous iteration.
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ inp[2:3]*r^2

	vadd.i32	d24,d24,d14	@ accumulate inp[0:1]
	vmull.u32	q7,d25,d0[1]
	vadd.i32	d20,d20,d10
	vmull.u32	q5,d21,d0[1]
	vadd.i32	d26,d26,d16
	vmull.u32	q8,d27,d0[1]
	vmlal.u32	q7,d23,d1[1]
	vadd.i32	d22,d22,d12
	vmull.u32	q6,d23,d0[1]

	vadd.i32	d28,d28,d18
	vmull.u32	q9,d29,d0[1]
	subs	r2,r2,#64
	vmlal.u32	q5,d29,d2[1]
	it	lo
	movlo	r4,r5
	vmlal.u32	q8,d25,d1[1]
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q6,d21,d1[1]
	vmlal.u32	q9,d27,d1[1]

	vmlal.u32	q5,d27,d4[1]
	vmlal.u32	q8,d23,d3[1]
	vmlal.u32	q9,d25,d3[1]
	vmlal.u32	q6,d29,d4[1]
	vmlal.u32	q7,d21,d3[1]

	vmlal.u32	q8,d21,d5[1]
	vmlal.u32	q5,d25,d6[1]
	vmlal.u32	q9,d23,d5[1]
	vmlal.u32	q6,d27,d6[1]
	vmlal.u32	q7,d29,d6[1]

	vmlal.u32	q8,d29,d8[1]
	vmlal.u32	q5,d23,d8[1]
	vmlal.u32	q9,d21,d7[1]
	vmlal.u32	q6,d25,d8[1]
	vmlal.u32	q7,d27,d8[1]

	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
	add	r4,r4,#64

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4 and accumulate

	vmlal.u32	q8,d26,d0[0]
	vmlal.u32	q5,d20,d0[0]
	vmlal.u32	q9,d28,d0[0]
	vmlal.u32	q6,d22,d0[0]
	vmlal.u32	q7,d24,d0[0]
	vld1.32	d8[0],[r6,:32]

	vmlal.u32	q8,d24,d1[0]
	vmlal.u32	q5,d28,d2[0]
	vmlal.u32	q9,d26,d1[0]
	vmlal.u32	q6,d20,d1[0]
	vmlal.u32	q7,d22,d1[0]

	vmlal.u32	q8,d22,d3[0]
	vmlal.u32	q5,d26,d4[0]
	vmlal.u32	q9,d24,d3[0]
	vmlal.u32	q6,d28,d4[0]
	vmlal.u32	q7,d20,d3[0]

	vmlal.u32	q8,d20,d5[0]
	vmlal.u32	q5,d24,d6[0]
	vmlal.u32	q9,d22,d5[0]
	vmlal.u32	q6,d26,d6[0]
	vmlal.u32	q8,d28,d8[0]

	vmlal.u32	q7,d28,d6[0]
	vmlal.u32	q5,d22,d8[0]
	vmlal.u32	q9,d20,d7[0]
	vmov.i32	q14,#1<<24		@ padbit, yes, always
	vmlal.u32	q6,d24,d8[0]
	vmlal.u32	q7,d26,d8[0]

	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
	add	r1,r1,#64
# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q11,q11
	vrev32.8	q12,q12
	vrev32.8	q13,q13
# endif

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
	@ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.

	vshr.u64	q15,q8,#26
	vmovn.i64	d16,q8
	vshr.u64	q4,q5,#26
	vmovn.i64	d10,q5
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vbic.i32	d16,#0xfc000000
	vsri.u32	q14,q13,#8		@ base 2^32 -> base 2^26
	vadd.i64	q6,q6,q4		@ h0 -> h1
	vshl.u32	q13,q13,#18
	vbic.i32	d10,#0xfc000000

	vshrn.u64	d30,q9,#26
	vmovn.i64	d18,q9
	vshr.u64	q4,q6,#26
	vmovn.i64	d12,q6
	vadd.i64	q7,q7,q4		@ h1 -> h2
	vsri.u32	q13,q12,#14
	vbic.i32	d18,#0xfc000000
	vshl.u32	q12,q12,#12
	vbic.i32	d12,#0xfc000000

	vadd.i32	d10,d10,d30
	vshl.u32	d30,d30,#2
	vbic.i32	q13,#0xfc000000
	vshrn.u64	d8,q7,#26
	vmovn.i64	d14,q7
	vaddl.u32	q5,d10,d30	@ h4 -> h0 [widen for a sec]
	vsri.u32	q12,q11,#20
	vadd.i32	d16,d16,d8	@ h2 -> h3
	vshl.u32	q11,q11,#6
	vbic.i32	d14,#0xfc000000
	vbic.i32	q12,#0xfc000000

	vshrn.u64	d30,q5,#26		@ re-narrow
	vmovn.i64	d10,q5
	vsri.u32	q11,q10,#26
	vbic.i32	q10,#0xfc000000
	vshr.u32	d8,d16,#26
	vbic.i32	d16,#0xfc000000
	vbic.i32	d10,#0xfc000000
	vadd.i32	d12,d12,d30	@ h0 -> h1
	vadd.i32	d18,d18,d8	@ h3 -> h4
	vbic.i32	q11,#0xfc000000

	bhi	.Loop_neon

.Lskip_loop:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	add	r7,r0,#(48+0*9*4)
	add	r6,r0,#(48+1*9*4)
	adds	r2,r2,#32
	it	ne
	movne	r2,#0
	bne	.Long_tail

	vadd.i32	d25,d24,d14	@ add hash value and move to #hi
	vadd.i32	d21,d20,d10
	vadd.i32	d27,d26,d16
	vadd.i32	d23,d22,d12
	vadd.i32	d29,d28,d18

.Long_tail:
	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^1
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^2

	vadd.i32	d24,d24,d14	@ can be redundant
	vmull.u32	q7,d25,d0
	vadd.i32	d20,d20,d10
	vmull.u32	q5,d21,d0
	vadd.i32	d26,d26,d16
	vmull.u32	q8,d27,d0
	vadd.i32	d22,d22,d12
	vmull.u32	q6,d23,d0
	vadd.i32	d28,d28,d18
	vmull.u32	q9,d29,d0

	vmlal.u32	q5,d29,d2
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vmlal.u32	q8,d25,d1
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vmlal.u32	q6,d21,d1
	vmlal.u32	q9,d27,d1
	vmlal.u32	q7,d23,d1

	vmlal.u32	q8,d23,d3
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q5,d27,d4
	vld1.32	d8[0],[r6,:32]
	vmlal.u32	q9,d25,d3
	vmlal.u32	q6,d29,d4
	vmlal.u32	q7,d21,d3

	vmlal.u32	q8,d21,d5
	it	ne
	addne	r7,r0,#(48+2*9*4)
	vmlal.u32	q5,d25,d6
	it	ne
	addne	r6,r0,#(48+3*9*4)
	vmlal.u32	q9,d23,d5
	vmlal.u32	q6,d27,d6
	vmlal.u32	q7,d29,d6

	vmlal.u32	q8,d29,d8
	vorn	q0,q0,q0	@ all-ones, can be redundant
	vmlal.u32	q5,d23,d8
	vshr.u64	q0,q0,#38
	vmlal.u32	q9,d21,d7
	vmlal.u32	q6,d25,d8
	vmlal.u32	q7,d27,d8

	beq	.Lshort_tail

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4:r^3 and accumulate

	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^3
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4

	vmlal.u32	q7,d24,d0
	vmlal.u32	q5,d20,d0
	vmlal.u32	q8,d26,d0
	vmlal.u32	q6,d22,d0
	vmlal.u32	q9,d28,d0

	vmlal.u32	q5,d28,d2
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vmlal.u32	q8,d24,d1
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vmlal.u32	q6,d20,d1
	vmlal.u32	q9,d26,d1
	vmlal.u32	q7,d22,d1

	vmlal.u32	q8,d22,d3
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q5,d26,d4
	vld1.32	d8[0],[r6,:32]
	vmlal.u32	q9,d24,d3
	vmlal.u32	q6,d28,d4
	vmlal.u32	q7,d20,d3

	vmlal.u32	q8,d20,d5
	vmlal.u32	q5,d24,d6
	vmlal.u32	q9,d22,d5
	vmlal.u32	q6,d26,d6
	vmlal.u32	q7,d28,d6

	vmlal.u32	q8,d28,d8
	vorn	q0,q0,q0	@ all-ones
	vmlal.u32	q5,d22,d8
	vshr.u64	q0,q0,#38
	vmlal.u32	q9,d20,d7
	vmlal.u32	q6,d24,d8
	vmlal.u32	q7,d26,d8

.Lshort_tail:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ horizontal addition

	vadd.i64	d16,d16,d17
	vadd.i64	d10,d10,d11
	vadd.i64	d18,d18,d19
	vadd.i64	d12,d12,d13
	vadd.i64	d14,d14,d15

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction, but without narrowing

	vshr.u64	q15,q8,#26
	vand.i64	q8,q8,q0
	vshr.u64	q4,q5,#26
	vand.i64	q5,q5,q0
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vadd.i64	q6,q6,q4		@ h0 -> h1

	vshr.u64	q15,q9,#26
	vand.i64	q9,q9,q0
	vshr.u64	q4,q6,#26
	vand.i64	q6,q6,q0
	vadd.i64	q7,q7,q4		@ h1 -> h2

	vadd.i64	q5,q5,q15
	vshl.u64	q15,q15,#2
	vshr.u64	q4,q7,#26
	vand.i64	q7,q7,q0
	vadd.i64	q5,q5,q15		@ h4 -> h0
	vadd.i64	q8,q8,q4		@ h2 -> h3

	vshr.u64	q15,q5,#26
	vand.i64	q5,q5,q0
	vshr.u64	q4,q8,#26
	vand.i64	q8,q8,q0
	vadd.i64	q6,q6,q15		@ h0 -> h1
	vadd.i64	q9,q9,q4		@ h3 -> h4

	cmp	r2,#0
	bne	.Leven

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ store hash value

	vst4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
	vst1.32	{d18[0]},[r0]

	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}			@ epilogue
	ldmia	sp!,{r4,r5,r6,r7}
.Lno_data_neon:
	bx	lr					@ bx	lr
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
	ldr	ip,[r0,#36]		@ is_base2_26

	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

	tst	ip,ip
	beq	.Lpoly1305_emit_enter

	ldmia	r0,{r3,r4,r5,r6,r7}
	eor	r8,r8,r8

	adds	r3,r3,r4,lsl#26	@ base 2^26 -> base 2^32
	mov	r4,r4,lsr#6
	adcs	r4,r4,r5,lsl#20
	mov	r5,r5,lsr#12
	adcs	r5,r5,r6,lsl#14
	mov	r6,r6,lsr#18
	adcs	r6,r6,r7,lsl#8
	adc	r7,r8,r7,lsr#24	@ can be partially reduced ...

	and	r8,r7,#-4		@ ... so reduce
	and	r7,r7,#3
	add	r8,r8,r8,lsr#2	@ *= 5
	adds	r3,r3,r8
	adcs	r4,r4,#0
	adcs	r5,r5,#0
	adcs	r6,r6,#0
	adc	r7,r7,#0

	adds	r8,r3,#5		@ compare to modulus
	adcs	r9,r4,#0
	adcs	r10,r5,#0
	adcs	r11,r6,#0
	adc	r7,r7,#0
	tst	r7,#4			@ did it carry/borrow?

	it	ne
	movne	r3,r8
	ldr	r8,[r2,#0]
	it	ne
	movne	r4,r9
	ldr	r9,[r2,#4]
	it	ne
	movne	r5,r10
	ldr	r10,[r2,#8]
	it	ne
	movne	r6,r11
	ldr	r11,[r2,#12]

	adds	r3,r3,r8		@ accumulate nonce
	adcs	r4,r4,r9
	adcs	r5,r5,r10
	adc	r6,r6,r11

# ifdef __ARMEB__
	rev	r3,r3
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
# endif
	str	r3,[r1,#0]		@ store the result
	str	r4,[r1,#4]
	str	r5,[r1,#8]
	str	r6,[r1,#12]

	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
	bx	lr				@ bx	lr
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
# ifdef	_WIN32
.word	OPENSSL_armcap_P
# else
.word	OPENSSL_armcap_P-.Lpoly1305_init
# endif
#endif
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#if	__ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif