// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__arm__)
#include "ring_core_generated/prefix_symbols_asm.h"
#include <ring-core/arm_arch.h>

.text
.fpu	neon
.code	32
#undef	__thumb2__
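@ GHASH primitives for 32-bit ARM using the ARMv8 polynomial-multiply
@ ("clmul") instructions, as emitted by the BoringSSL/CRYPTOGAMS generator.
@ The polynomial multiplies appear below as raw .byte sequences with the
@ intended pmull/pmull2 operation noted in the trailing comment; encoding
@ them by hand presumably keeps the file assemblable even when the assembler
@ does not accept those mnemonics in this mode (an inference, not stated by
@ the generator).
@
@ gcm_init_clmul derives the GHASH key table from the hash subkey H. Per the
@ register usage below, r1 points at the raw 128-bit H and r0 at the table
@ that receives the "twisted" H (Htable[0]) plus the Karatsuba pre-processed
@ value and H^2 (Htable[1..2]). An assumed C-level prototype, for orientation
@ only (types are indicative):
@   void gcm_init_clmul(u128 Htable[], const uint64_t H[2]);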
.globl	gcm_init_clmul
.hidden	gcm_init_clmul
.type	gcm_init_clmul,%function
.align	4
gcm_init_clmul:
	AARCH64_VALID_CALL_TARGET
	vld1.64	{q9},[r1]		@ load input H
	vmov.i8	q11,#0xe1
	vshl.i64	q11,q11,#57		@ 0xc2.0
	vext.8	q3,q9,q9,#8
	vshr.u64	q10,q11,#63
	vdup.32	q9,d18[1]
	vext.8	q8,q10,q11,#8		@ t0=0xc2....01
	vshr.u64	q10,q3,#63
	vshr.s32	q9,q9,#31		@ broadcast carry bit
	vand	q10,q10,q8
	vshl.i64	q3,q3,#1
	vext.8	q10,q10,q10,#8
	vand	q8,q8,q9
	vorr	q3,q3,q10		@ H<<<=1
	veor	q12,q3,q8		@ twisted H
	vst1.64	{q12},[r0]!		@ store Htable[0]

	@ calculate H^2
	vext.8	q8,q12,q12,#8		@ Karatsuba pre-processing
.byte	0xa8,0x0e,0xa8,0xf2	@ pmull q0,q12,q12
	veor	q8,q8,q12
.byte	0xa9,0x4e,0xa9,0xf2	@ pmull2 q2,q12,q12
.byte	0xa0,0x2e,0xa0,0xf2	@ pmull q1,q8,q8

	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
	veor	q10,q0,q2
	veor	q1,q1,q9
	veor	q1,q1,q10
.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase

	vmov	d4,d3		@ Xh|Xm - 256-bit result
	vmov	d3,d0		@ Xm is rotated Xl
	veor	q0,q1,q10

	vext.8	q10,q0,q0,#8		@ 2nd phase
.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
	veor	q10,q10,q2
	veor	q14,q0,q10

	vext.8	q9,q14,q14,#8		@ Karatsuba pre-processing
	veor	q9,q9,q14
	vext.8	q13,q8,q9,#8		@ pack Karatsuba pre-processed
	vst1.64	{q13,q14},[r0]		@ store Htable[1..2]

	bx	lr
.size	gcm_init_clmul,.-gcm_init_clmul
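@ gcm_gmult_clmul performs one GHASH multiplication with no new input:
@ Xi is loaded from [r0], multiplied in GF(2^128) by the twisted H taken
@ from the Htable at r1, reduced via the 0xc2..01 constant, and written
@ back to [r0]. An assumed C-level prototype, for orientation only
@ (types are indicative):
@   void gcm_gmult_clmul(uint64_t Xi[2], const u128 Htable[]);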
.globl	gcm_gmult_clmul
.hidden	gcm_gmult_clmul
.type	gcm_gmult_clmul,%function
.align	4
gcm_gmult_clmul:
	AARCH64_VALID_CALL_TARGET
	vld1.64	{q9},[r0]		@ load Xi
	vmov.i8	q11,#0xe1
	vld1.64	{q12,q13},[r1]	@ load twisted H, ...
	vshl.u64	q11,q11,#57
#ifndef __ARMEB__
	vrev64.8	q9,q9
#endif
	vext.8	q3,q9,q9,#8

.byte	0x86,0x0e,0xa8,0xf2	@ pmull q0,q12,q3		@ H.lo·Xi.lo
	veor	q9,q9,q3		@ Karatsuba pre-processing
.byte	0x87,0x4e,0xa9,0xf2	@ pmull2 q2,q12,q3		@ H.hi·Xi.hi
.byte	0xa2,0x2e,0xaa,0xf2	@ pmull q1,q13,q9		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
	veor	q10,q0,q2
	veor	q1,q1,q9
	veor	q1,q1,q10
.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase of reduction

	vmov	d4,d3		@ Xh|Xm - 256-bit result
	vmov	d3,d0		@ Xm is rotated Xl
	veor	q0,q1,q10

	vext.8	q10,q0,q0,#8		@ 2nd phase of reduction
.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
	veor	q10,q10,q2
	veor	q0,q0,q10

#ifndef __ARMEB__
	vrev64.8	q0,q0
#endif
	vext.8	q0,q0,q0,#8
	vst1.64	{q0},[r0]		@ write out Xi

	bx	lr
.size	gcm_gmult_clmul,.-gcm_gmult_clmul
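@ gcm_ghash_clmul folds input data into Xi: for each 16-byte block it
@ computes Xi = (Xi ^ block) * H in GF(2^128), handling two blocks per
@ iteration of .Loop_mod2x_v8 with H and H^2 from the Htable and a
@ single-block tail in .Lodd_tail_v8. Per the register usage below,
@ r0 = Xi, r1 = Htable, r2 = input pointer, r3 = length in bytes (the
@ loop consumes whole 16-byte blocks). An assumed C-level prototype,
@ for orientation only (types are indicative):
@   void gcm_ghash_clmul(uint64_t Xi[2], const u128 Htable[],
@                        const uint8_t *inp, size_t len);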
.globl	gcm_ghash_clmul
.hidden	gcm_ghash_clmul
.type	gcm_ghash_clmul,%function
.align	4
gcm_ghash_clmul:
	AARCH64_VALID_CALL_TARGET
	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ 32-bit ABI says so
	vld1.64	{q0},[r0]		@ load [rotated] Xi
						@ "[rotated]" means that
						@ loaded value would have
						@ to be rotated in order to
						@ make it appear as in
						@ algorithm specification
	subs	r3,r3,#32		@ see if r3 is 32 or larger
	mov	r12,#16		@ r12 is used as post-
						@ increment for input pointer;
						@ as loop is modulo-scheduled
						@ r12 is zeroed just in time
						@ to preclude overstepping
						@ inp[len], which means that
						@ last block[s] are actually
						@ loaded twice, but last
						@ copy is not processed
	vld1.64	{q12,q13},[r1]!	@ load twisted H, ..., H^2
	vmov.i8	q11,#0xe1
	vld1.64	{q14},[r1]
	moveq	r12,#0			@ is it time to zero r12?
	vext.8	q0,q0,q0,#8		@ rotate Xi
	vld1.64	{q8},[r2]!	@ load [rotated] I[0]
	vshl.u64	q11,q11,#57		@ compose 0xc2.0 constant
#ifndef __ARMEB__
	vrev64.8	q8,q8
	vrev64.8	q0,q0
#endif
	vext.8	q3,q8,q8,#8		@ rotate I[0]
	blo	.Lodd_tail_v8		@ r3 was less than 32
	vld1.64	{q9},[r2],r12	@ load [rotated] I[1]
#ifndef __ARMEB__
	vrev64.8	q9,q9
#endif
	vext.8	q7,q9,q9,#8
	veor	q3,q3,q0		@ I[i]^=Xi
.byte	0x8e,0x8e,0xa8,0xf2	@ pmull q4,q12,q7		@ H·Ii+1
	veor	q9,q9,q7		@ Karatsuba pre-processing
.byte	0x8f,0xce,0xa9,0xf2	@ pmull2 q6,q12,q7
	b	.Loop_mod2x_v8

.align	4
.Loop_mod2x_v8:
	vext.8	q10,q3,q3,#8
	subs	r3,r3,#32		@ is there more data?
.byte	0x86,0x0e,0xac,0xf2	@ pmull q0,q14,q3		@ H^2.lo·Xi.lo
	movlo	r12,#0			@ is it time to zero r12?

.byte	0xa2,0xae,0xaa,0xf2	@ pmull q5,q13,q9
	veor	q10,q10,q3		@ Karatsuba pre-processing
.byte	0x87,0x4e,0xad,0xf2	@ pmull2 q2,q14,q3		@ H^2.hi·Xi.hi
	veor	q0,q0,q4		@ accumulate
.byte	0xa5,0x2e,0xab,0xf2	@ pmull2 q1,q13,q10		@ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	vld1.64	{q8},[r2],r12	@ load [rotated] I[i+2]

	veor	q2,q2,q6
	moveq	r12,#0			@ is it time to zero r12?
	veor	q1,q1,q5

	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
	veor	q10,q0,q2
	veor	q1,q1,q9
	vld1.64	{q9},[r2],r12	@ load [rotated] I[i+3]
#ifndef __ARMEB__
	vrev64.8	q8,q8
#endif
	veor	q1,q1,q10
.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase of reduction

#ifndef __ARMEB__
	vrev64.8	q9,q9
#endif
	vmov	d4,d3		@ Xh|Xm - 256-bit result
	vmov	d3,d0		@ Xm is rotated Xl
	vext.8	q7,q9,q9,#8
	vext.8	q3,q8,q8,#8
	veor	q0,q1,q10
.byte	0x8e,0x8e,0xa8,0xf2	@ pmull q4,q12,q7		@ H·Ii+1
	veor	q3,q3,q2		@ accumulate q3 early

	vext.8	q10,q0,q0,#8		@ 2nd phase of reduction
.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
	veor	q3,q3,q10
	veor	q9,q9,q7		@ Karatsuba pre-processing
	veor	q3,q3,q0
.byte	0x8f,0xce,0xa9,0xf2	@ pmull2 q6,q12,q7
	bhs	.Loop_mod2x_v8		@ there was at least 32 more bytes

	veor	q2,q2,q10
	vext.8	q3,q8,q8,#8		@ re-construct q3
	adds	r3,r3,#32		@ re-construct r3
	veor	q0,q0,q2		@ re-construct q0
	beq	.Ldone_v8		@ is r3 zero?
.Lodd_tail_v8:
	vext.8	q10,q0,q0,#8
	veor	q3,q3,q0		@ inp^=Xi
	veor	q9,q8,q10		@ q9 is rotated inp^Xi

.byte	0x86,0x0e,0xa8,0xf2	@ pmull q0,q12,q3		@ H.lo·Xi.lo
	veor	q9,q9,q3		@ Karatsuba pre-processing
.byte	0x87,0x4e,0xa9,0xf2	@ pmull2 q2,q12,q3		@ H.hi·Xi.hi
.byte	0xa2,0x2e,0xaa,0xf2	@ pmull q1,q13,q9		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
	veor	q10,q0,q2
	veor	q1,q1,q9
	veor	q1,q1,q10
.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase of reduction

	vmov	d4,d3		@ Xh|Xm - 256-bit result
	vmov	d3,d0		@ Xm is rotated Xl
	veor	q0,q1,q10

	vext.8	q10,q0,q0,#8		@ 2nd phase of reduction
.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
	veor	q10,q10,q2
	veor	q0,q0,q10

.Ldone_v8:
#ifndef __ARMEB__
	vrev64.8	q0,q0
#endif
	vext.8	q0,q0,q0,#8
	vst1.64	{q0},[r0]		@ write out Xi

	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ 32-bit ABI says so
	bx	lr
.size	gcm_ghash_clmul,.-gcm_ghash_clmul
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif
#endif  // !OPENSSL_NO_ASM
.section	.note.GNU-stack,"",%progbits