// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#include "ring_core_generated/prefix_symbols_asm.h"
#include <ring-core/arm_arch.h>

.text
.arch	armv8-a+crypto
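// gcm_init_clmul: per the loads and stores below, x0 points at the Htable
// being filled and x1 at the raw hash subkey H. The routine derives the
// "twisted" H, a packed Karatsuba helper value, and H^2, and stores them as
// Htable[0..2].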
.globl	gcm_init_clmul
.hidden	gcm_init_clmul
.type	gcm_init_clmul,%function
.align	4
gcm_init_clmul:
	AARCH64_VALID_CALL_TARGET
	ld1	{v17.2d},[x1]		//load input H
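	//0xe1 shifted left by 57 yields the 0xc2...01 constant used by the
	//two-phase reduction below; 0xe1 is the bit-reflected encoding of the
	//low terms (x^7+x^2+x+1) of the GHASH polynomial x^128+x^7+x^2+x+1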
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57		//0xc2.0
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
	eor	v20.16b,v3.16b,v16.16b		//twisted H
	st1	{v20.2d},[x0],#16		//store Htable[0]

	//calculate H^2
	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b

	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0]		//store Htable[1..2]

	ret
.size	gcm_init_clmul,.-gcm_init_clmul
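// gcm_gmult_clmul: as the inline comments indicate, x0 holds the 128-bit
// accumulator Xi and x1 the Htable; the routine multiplies Xi by the twisted H
// in GF(2^128) (one GHASH step with no input block) and writes Xi back.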
.globl	gcm_gmult_clmul
.hidden	gcm_gmult_clmul
.type	gcm_gmult_clmul,%function
.align	4
gcm_gmult_clmul:
	AARCH64_VALID_CALL_TARGET
	ld1	{v17.2d},[x0]		//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
	shl	v19.2d,v19.2d,#57
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_gmult_clmul,.-gcm_gmult_clmul
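// gcm_ghash_clmul: per the register usage below, x0 is Xi, x1 is Htable
// (twisted H, packed Karatsuba value, H^2), x2 is the input pointer and x3 the
// byte count. The modulo-scheduled main loop folds two 16-byte blocks per
// iteration using H^2, with .Lodd_tail_v8 handling a final single block.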
.globl	gcm_ghash_clmul
.hidden	gcm_ghash_clmul
.type	gcm_ghash_clmul,%function
.align	4
gcm_ghash_clmul:
	AARCH64_VALID_CALL_TARGET
	ld1	{v0.2d},[x0]		//load [rotated] Xi
						//"[rotated]" means that
						//loaded value would have
						//to be rotated in order to
						//make it appear as in
						//algorithm specification
	subs	x3,x3,#32		//see if x3 is 32 or larger
	mov	x12,#16		//x12 is used as post-
						//increment for input pointer;
						//as loop is modulo-scheduled
						//x12 is zeroed just in time
						//to preclude overstepping
						//inp[len], which means that
						//last block[s] are actually
						//loaded twice, but last
						//copy is not processed
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
	b.lo	.Lodd_tail_v8		//x3 was less than 32
	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	b	.Loop_mod2x_v8

.align	4
.Loop_mod2x_v8:
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32		//is there more data?
	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo			//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b		//accumulate
	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	b.hs	.Loop_mod2x_v8		//there was at least 32 more bytes

	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
	adds	x3,x3,#32		//re-construct x3
	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
	b.eq	.Ldone_v8		//is x3 zero?
.Lodd_tail_v8:
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

.Ldone_v8:
#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_ghash_clmul,.-gcm_ghash_clmul
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif
#endif  // !OPENSSL_NO_ASM
.section	.note.GNU-stack,"",%progbits