// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text

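// gcm_init_v8 precomputes the hash-key table used by gcm_gmult_v8 and
// gcm_ghash_v8 below. A sketch of the assumed C-side prototype (taken from
// the BoringSSL GCM internals, not from this generated file):
//
//	void gcm_init_v8(u128 Htable[16], const uint64_t H[2]);
//
// On entry x0 = Htable and x1 = H (AAPCS64). Htable[0] receives the
// "twisted" H (H<<1 reduced mod the GHASH polynomial), Htable[1] the packed
// Karatsuba pre-processed halves of H and H^2, and Htable[2] the twisted H^2.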
.globl	_gcm_init_v8
.private_extern	_gcm_init_v8

.align	4
_gcm_init_v8:
	AARCH64_VALID_CALL_TARGET
	ld1	{v17.2d},[x1]		//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57		//0xc2.0
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
	eor	v20.16b,v3.16b,v16.16b		//twisted H
	st1	{v20.2d},[x0],#16		//store Htable[0]

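	//Each 128x128-bit carry-less product below is built from three
	//64x64-bit pmull/pmull2 multiplies (Karatsuba over GF(2), where
	//addition is XOR):
	//  lo  = a.lo·b.lo,  hi = a.hi·b.hi,
	//  mid = (a.lo^a.hi)·(b.lo^b.hi) ^ lo ^ hi
	//The 256-bit result is then reduced mod x^128+x^7+x^2+x+1 in two
	//phases using the 0xc2.0 constant in v19 (0xe1 being the
	//bit-reflected encoding of x^7+x^2+x+1).
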
	//calculate H^2
	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b

	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0]		//store Htable[1..2]

	ret

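// gcm_gmult_v8 multiplies the 128-bit accumulator Xi by H in GF(2^128) and
// reduces, writing the result back over Xi. A sketch of the assumed C-side
// prototype (from the BoringSSL GCM internals, not part of this generated
// file):
//
//	void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[16]);
//
// On entry x0 = Xi and x1 = Htable (AAPCS64).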
.globl	_gcm_gmult_v8
.private_extern	_gcm_gmult_v8

.align	4
_gcm_gmult_v8:
	AARCH64_VALID_CALL_TARGET
	ld1	{v17.2d},[x0]		//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
	shl	v19.2d,v19.2d,#57
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret

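// gcm_ghash_v8 folds len bytes of input into the accumulator, computing
// Xi = (Xi ^ block)·H for each 16-byte block. A sketch of the assumed
// C-side prototype (from the BoringSSL GCM internals, not part of this
// generated file; len is assumed to be a multiple of 16):
//
//	void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[16],
//	                  const uint8_t *inp, size_t len);
//
// On entry x0 = Xi, x1 = Htable, x2 = inp and x3 = len (AAPCS64).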
.globl	_gcm_ghash_v8
.private_extern	_gcm_ghash_v8

.align	4
_gcm_ghash_v8:
	AARCH64_VALID_CALL_TARGET
	ld1	{v0.2d},[x0]		//load [rotated] Xi
						//"[rotated]" means that
						//loaded value would have
						//to be rotated in order to
						//make it appear as in
						//algorithm specification
	subs	x3,x3,#32		//see if x3 is 32 or larger
	mov	x12,#16		//x12 is used as post-
						//increment for input pointer;
						//as loop is modulo-scheduled
						//x12 is zeroed just in time
						//to preclude overstepping
						//inp[len], which means that
						//last block[s] are actually
						//loaded twice, but last
						//copy is not processed
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
	b.lo	Lodd_tail_v8		//x3 was less than 32
	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	b	Loop_mod2x_v8

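	//Main loop, two blocks per iteration: the aggregated product
	//  Xi = (Xi^I[i])·H^2 ^ I[i+1]·H
	//is computed with a single reduction, halving the number of
	//reduction steps relative to one-block-at-a-time processing.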
.align	4
Loop_mod2x_v8:
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32		//is there more data?
	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo			//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b		//accumulate
	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	b.hs	Loop_mod2x_v8		//there were at least 32 more bytes

	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
	adds	x3,x3,#32		//re-construct x3
	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
	b.eq	Ldone_v8		//is x3 zero?
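	//Process the final [odd] block as a plain (Xi^inp)·H multiply
	//with one reduction.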
Lodd_tail_v8:
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

Ldone_v8:
#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret

.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0	// "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.align	2
.align	2
#endif  // !OPENSSL_NO_ASM