// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL
@ instructions are in aesv8-armx.pl.)


.text
#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
#define ldrplb  ldrbpl
#define ldrneb  ldrbne
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif


.align	5
rem_4bit:
.short	0x0000,0x1C20,0x3840,0x2460
.short	0x7080,0x6CA0,0x48C0,0x54E0
.short	0xE100,0xFD20,0xD940,0xC560
.short	0x9180,0x8DA0,0xA9C0,0xB5E0


#ifdef __thumb2__
.thumb_func	rem_4bit_get
#endif
rem_4bit_get:
#if defined(__thumb2__)
	adr	r2,rem_4bit
#else
	sub	r2,pc,#8+32	@ &rem_4bit
#endif
	b	Lrem_4bit_got
	nop
	nop


.globl	_gcm_ghash_4bit
.private_extern	_gcm_ghash_4bit
#ifdef __thumb2__
.thumb_func	_gcm_ghash_4bit
#endif
.align	4
_gcm_ghash_4bit:
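@ r0 = Xi (16-byte hash value, updated in place), r1 = Htbl (4-bit lookup
@ table), r2 = input, r3 = input length in bytes (consumed in 16-byte blocks)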
#if defined(__thumb2__)
	adr	r12,rem_4bit
#else
	sub	r12,pc,#8+48		@ &rem_4bit
#endif
	add	r3,r2,r3		@ r3 to point at the end
	stmdb	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}		@ save r3/end too

	ldmia	r12,{r4,r5,r6,r7,r8,r9,r10,r11}		@ copy rem_4bit ...
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}		@ ... to stack

	ldrb	r12,[r2,#15]
	ldrb	r14,[r0,#15]
Louter:
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4,r5,r6,r7}	@ load Htbl[nlo]
	add	r11,r1,r14
	ldrb	r12,[r2,#14]

	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[sp,r14]		@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	ldrb	r14,[r0,#14]
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16

Linner:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[sp,r12]		@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	r12,[r2,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	r8,[r0,r3]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r9,[sp,r14]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
#ifdef	__thumb2__
	it	pl
#endif
	eorpl	r12,r12,r8
	eor	r7,r11,r7,lsr#4
#ifdef	__thumb2__
	itt	pl
#endif
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r9,lsl#16	@ ^= rem_4bit[rem]
	bpl	Linner

	ldr	r3,[sp,#32]		@ re-load r3/end
	add	r2,r2,#16
	mov	r14,r4
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif
	cmp	r2,r3
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif

#ifdef __thumb2__
	it	ne
#endif
	ldrneb	r12,[r2,#15]
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

	bne	Louter

	add	sp,sp,#36
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif


.globl	_gcm_gmult_4bit
.private_extern	_gcm_gmult_4bit
#ifdef __thumb2__
.thumb_func	_gcm_gmult_4bit
#endif
_gcm_gmult_4bit:
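@ r0 = Xi (16-byte hash value, updated in place), r1 = Htbl (4-bit lookup table)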
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	ldrb	r12,[r0,#15]
	b	rem_4bit_get
Lrem_4bit_got:
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4,r5,r6,r7}	@ load Htbl[nlo]
	ldrb	r12,[r0,#14]

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	and	r14,r12,#0xf0
	eor	r7,r7,r8,lsl#16
	and	r12,r12,#0x0f

Loop:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[r2,r12]	@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	r12,[r0,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
#ifdef	__thumb2__
	itt	pl
#endif
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	bpl	Loop
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif

#if __ARM_MAX_ARCH__>=7



.globl	_gcm_init_neon
.private_extern	_gcm_init_neon
#ifdef __thumb2__
.thumb_func	_gcm_init_neon
#endif
.align	4
_gcm_init_neon:
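@ r0 = output (receives the twisted H used by the NEON routines below),
@ r1 = H (the 16-byte GHASH key)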
	vld1.64	d7,[r1]!		@ load H
	vmov.i8	q8,#0xe1
	vld1.64	d6,[r1]
	vshl.i64	d17,#57
	vshr.u64	d16,#63		@ t0=0xc2....01
	vdup.8	q9,d7[7]
	vshr.u64	d26,d6,#63
	vshr.s8	q9,#7			@ broadcast carry bit
	vshl.i64	q3,q3,#1
	vand	q8,q8,q9
	vorr	d7,d26		@ H<<<=1
	veor	q3,q3,q8		@ twisted H
	vstmia	r0,{q3}

	bx	lr					@ bx lr


.globl	_gcm_gmult_neon
.private_extern	_gcm_gmult_neon
#ifdef __thumb2__
.thumb_func	_gcm_gmult_neon
#endif
.align	4
_gcm_gmult_neon:
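@ r0 = Xi (16-byte hash value, updated in place), r1 = twisted H from _gcm_init_neon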
	vld1.64	d7,[r0]!		@ load Xi
	vld1.64	d6,[r0]!
	vmov.i64	d29,#0x0000ffffffffffff
	vldmia	r1,{d26,d27}	@ load twisted H
	vmov.i64	d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	q3,q3
#endif
	vmov.i64	d31,#0x000000000000ffff
	veor	d28,d26,d27		@ Karatsuba pre-processing
	mov	r3,#16
	b	Lgmult_neon


.globl	_gcm_ghash_neon
.private_extern	_gcm_ghash_neon
#ifdef __thumb2__
.thumb_func	_gcm_ghash_neon
#endif
.align	4
_gcm_ghash_neon:
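@ r0 = Xi (16-byte hash value, updated in place), r1 = twisted H from
@ _gcm_init_neon, r2 = input, r3 = input length in bytes (consumed in
@ 16-byte blocks)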
	vld1.64	d1,[r0]!		@ load Xi
	vld1.64	d0,[r0]!
	vmov.i64	d29,#0x0000ffffffffffff
	vldmia	r1,{d26,d27}	@ load twisted H
	vmov.i64	d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	q0,q0
#endif
	vmov.i64	d31,#0x000000000000ffff
	veor	d28,d26,d27		@ Karatsuba pre-processing

Loop_neon:
	vld1.64	d7,[r2]!		@ load inp
	vld1.64	d6,[r2]!
#ifdef __ARMEL__
	vrev64.8	q3,q3
#endif
	veor	q3,q0			@ inp^=Xi
Lgmult_neon:
	vext.8	d16, d26, d26, #1	@ A1
	vmull.p8	q8, d16, d6		@ F = A1*B
	vext.8	d0, d6, d6, #1	@ B1
	vmull.p8	q0, d26, d0		@ E = A*B1
	vext.8	d18, d26, d26, #2	@ A2
	vmull.p8	q9, d18, d6		@ H = A2*B
	vext.8	d22, d6, d6, #2	@ B2
	vmull.p8	q11, d26, d22		@ G = A*B2
	vext.8	d20, d26, d26, #3	@ A3
	veor	q8, q8, q0		@ L = E + F
	vmull.p8	q10, d20, d6		@ J = A3*B
	vext.8	d0, d6, d6, #3	@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q0, d26, d0		@ I = A*B3
	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d6, d6, #4	@ B4
	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d26, d22		@ K = A*B4
	veor	q10, q10, q0		@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q0, d26, d6		@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q0, q0, q8
	veor	q0, q0, q10
	veor	d6,d6,d7	@ Karatsuba pre-processing
	vext.8	d16, d28, d28, #1	@ A1
	vmull.p8	q8, d16, d6		@ F = A1*B
	vext.8	d2, d6, d6, #1	@ B1
	vmull.p8	q1, d28, d2		@ E = A*B1
	vext.8	d18, d28, d28, #2	@ A2
	vmull.p8	q9, d18, d6		@ H = A2*B
	vext.8	d22, d6, d6, #2	@ B2
	vmull.p8	q11, d28, d22		@ G = A*B2
	vext.8	d20, d28, d28, #3	@ A3
	veor	q8, q8, q1		@ L = E + F
	vmull.p8	q10, d20, d6		@ J = A3*B
	vext.8	d2, d6, d6, #3	@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q1, d28, d2		@ I = A*B3
	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d6, d6, #4	@ B4
	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d28, d22		@ K = A*B4
	veor	q10, q10, q1		@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q1, d28, d6		@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q1, q1, q8
	veor	q1, q1, q10
	vext.8	d16, d27, d27, #1	@ A1
	vmull.p8	q8, d16, d7		@ F = A1*B
	vext.8	d4, d7, d7, #1	@ B1
	vmull.p8	q2, d27, d4		@ E = A*B1
	vext.8	d18, d27, d27, #2	@ A2
	vmull.p8	q9, d18, d7		@ H = A2*B
	vext.8	d22, d7, d7, #2	@ B2
	vmull.p8	q11, d27, d22		@ G = A*B2
	vext.8	d20, d27, d27, #3	@ A3
	veor	q8, q8, q2		@ L = E + F
	vmull.p8	q10, d20, d7		@ J = A3*B
	vext.8	d4, d7, d7, #3	@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q2, d27, d4		@ I = A*B3
	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d7, d7, #4	@ B4
	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d27, d22		@ K = A*B4
	veor	q10, q10, q2		@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q2, d27, d7		@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q2, q2, q8
	veor	q2, q2, q10
	veor	q1,q1,q0		@ Karatsuba post-processing
	veor	q1,q1,q2
	veor	d1,d1,d2
	veor	d4,d4,d3	@ Xh|Xl - 256-bit result

	@ equivalent of reduction_avx from ghash-x86_64.pl
	vshl.i64	q9,q0,#57		@ 1st phase
	vshl.i64	q10,q0,#62
	veor	q10,q10,q9		@
	vshl.i64	q9,q0,#63
	veor	q10, q10, q9		@
	veor	d1,d1,d20	@
	veor	d4,d4,d21

	vshr.u64	q10,q0,#1		@ 2nd phase
	veor	q2,q2,q0
	veor	q0,q0,q10		@
	vshr.u64	q10,q10,#6
	vshr.u64	q0,q0,#1		@
	veor	q0,q0,q2		@
	veor	q0,q0,q10		@

	subs	r3,#16
	bne	Loop_neon

#ifdef __ARMEL__
	vrev64.8	q0,q0
#endif
	sub	r0,#16
	vst1.64	d1,[r0]!		@ write out Xi
	vst1.64	d0,[r0]

	bx	lr					@ bx lr

#endif
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif  // !OPENSSL_NO_ASM