// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text
.arch	armv8-a+crypto
.globl	gcm_init_v8

.def gcm_init_v8
   .type 32
.endef
.align	4
//-----------------------------------------------------------------------
// void gcm_init_v8(Htable, H)
//
// Precompute the hash-key table used by gcm_gmult_v8 and gcm_ghash_v8.
// Reads the raw 16-byte GHASH key H from [x1] and stores three 16-byte
// entries at [x0]:
//   Htable[0] = "twisted" H  (H<<1 with the reduction folded in; see
//               the '//H<<<=1' / '//twisted H' comments below)
//   Htable[1] = packed Karatsuba pre-processed halves (for H and H^2)
//   Htable[2] = "twisted" H^2
//
// ABI:     AAPCS64; .def/.endef => COFF (Windows on Arm) build flavor
// In:      x0 = Htable (at least 48 bytes, written)
//          x1 = H (16 bytes, read)
// Out:     none (Htable[0..2] written)
// Clobber: x0, v0-v3, v16-v22; no callee-saved registers touched
//-----------------------------------------------------------------------
gcm_init_v8:
	AARCH64_VALID_CALL_TARGET	//BTI landing pad for indirect calls
	ld1	{v17.2d},[x1]		//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57		//0xc2.0 — GHASH reduction constant
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
	eor	v20.16b,v3.16b,v16.16b		//twisted H
	st1	{v20.2d},[x0],#16		//store Htable[0]

	//calculate H^2 by squaring twisted H with the same
	//Karatsuba/pmull schedule used in the hash loops below
	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b		//twisted H^2

	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0]		//store Htable[1..2]

	ret
73
.globl	gcm_gmult_v8

.def gcm_gmult_v8
   .type 32
.endef
.align	4
//-----------------------------------------------------------------------
// void gcm_gmult_v8(Xi, Htable)
//
// One GHASH multiplication: Xi = (Xi * H) mod P, with no data block
// XORed in.  Xi is read from and written back to [x0]; the precomputed
// twisted H and packed Karatsuba value from gcm_init_v8 are loaded
// from the first 32 bytes at [x1].
//
// In:      x0 = Xi (16 bytes, in/out), x1 = Htable (32 bytes read)
// Out:     none (Xi updated in place)
// Clobber: v0-v3, v17-v21; no callee-saved registers touched
//-----------------------------------------------------------------------
gcm_gmult_v8:
	AARCH64_VALID_CALL_TARGET
	ld1	{v17.2d},[x0]		//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 reduction constant
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b		//byte-swap Xi on little-endian
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

#ifndef __ARMEB__
	rev64	v0.16b,v0.16b		//byte-swap result back
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
118
.globl	gcm_ghash_v8

.def gcm_ghash_v8
   .type 32
.endef
.align	4
//-----------------------------------------------------------------------
// void gcm_ghash_v8(Xi, Htable, inp, len)
//
// Hash `len` bytes of input into the GHASH accumulator Xi.  The main
// loop (Loop_mod2x_v8) is modulo-scheduled and consumes 32 bytes (two
// blocks) per iteration using twisted H^2 from Htable[2]; a single
// trailing 16-byte block is handled at Lodd_tail_v8 using twisted H.
//
// In:      x0 = Xi (16 bytes, in/out)
//          x1 = Htable (48 bytes read: twisted H, packed Karatsuba, H^2)
//          x2 = inp (len bytes read)
//          x3 = len — assumed a positive multiple of 16; TODO confirm
//               against callers, this routine does not validate it
// Out:     none (Xi updated in place)
// Clobber: x3, x12, v0-v7, v16-v22; no callee-saved registers touched
//-----------------------------------------------------------------------
gcm_ghash_v8:
	AARCH64_VALID_CALL_TARGET
	ld1	{v0.2d},[x0]		//load [rotated] Xi
						//"[rotated]" means that
						//loaded value would have
						//to be rotated in order to
						//make it appear as in
						//algorithm specification
	subs	x3,x3,#32		//see if x3 is 32 or larger
	mov	x12,#16		//x12 is used as post-
						//increment for input pointer;
						//as loop is modulo-scheduled
						//x12 is zeroed just in time
						//to preclude overstepping
						//inp[len], which means that
						//last block[s] are actually
						//loaded twice, but last
						//copy is not processed
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
	b.lo	Lodd_tail_v8		//x3 was less than 32
	//software pipelining: start H·I[1] before entering the loop
	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	b	Loop_mod2x_v8

.align	4
//main loop: two blocks per iteration, interleaving the H^2 multiply of
//the current pair with the H multiply of the next pair and the loads
//of the pair after that
Loop_mod2x_v8:
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32		//is there more data?
	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo			//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b		//accumulate
	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	b.hs	Loop_mod2x_v8		//there was at least 32 more bytes

	//undo the speculative "accumulate early" done for the pipeline
	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
	adds	x3,x3,#32		//re-construct x3
	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
	b.eq	Ldone_v8		//is x3 zero?
//process the final single block with twisted H
Lodd_tail_v8:
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

Ldone_v8:
#ifndef __ARMEB__
	rev64	v0.16b,v0.16b		//byte-swap result back on little-endian
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
252
// "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif
#endif  // !OPENSSL_NO_ASM
