// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(_WIN32)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>
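// <openssl/arm_arch.h> provides the AARCH64_VALID_CALL_TARGET macro used at
// each entry point below; it emits a BTI landing-pad hint on builds where
// branch target identification is enabled and expands to nothing otherwise.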

.text

.globl	gcm_init_neon

.def gcm_init_neon
   .type 32
.endef
.align	4
gcm_init_neon:
	AARCH64_VALID_CALL_TARGET
	// This function is adapted from gcm_init_v8. xC2 is t3.
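	// x0 points at the Htable entry to write and x1 at the 128-bit hash
	// key H, matching the load and store below. At a high level, H is
	// doubled in GF(2^128): shifted left by one bit and, when a bit is
	// carried out of the top, reduced with the 0xc2... constant (the
	// bit-reflected form of the GHASH polynomial x^128 + x^7 + x^2 + x + 1).
	// The result is the "twisted H" that gcm_gmult_neon and gcm_ghash_neon
	// consume.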
	ld1	{v17.2d}, [x1]			// load H
	movi	v19.16b, #0xe1
	shl	v19.2d, v19.2d, #57		// 0xc2.0
	ext	v3.16b, v17.16b, v17.16b, #8
	ushr	v18.2d, v19.2d, #63
	dup	v17.4s, v17.s[1]
	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
	ushr	v18.2d, v3.2d, #63
	sshr	v17.4s, v17.4s, #31		// broadcast carry bit
	and	v18.16b, v18.16b, v16.16b
	shl	v3.2d, v3.2d, #1
	ext	v18.16b, v18.16b, v18.16b, #8
	and	v16.16b, v16.16b, v17.16b
	orr	v3.16b, v3.16b, v18.16b	// H<<<=1
	eor	v5.16b, v3.16b, v16.16b	// twisted H
	st1	{v5.2d}, [x0]			// store Htable[0]
	ret


.globl	gcm_gmult_neon

.def gcm_gmult_neon
   .type 32
.endef
.align	4
gcm_gmult_neon:
	AARCH64_VALID_CALL_TARGET
	ld1	{v3.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8		// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, Lmasks		// load constants
	add	x9, x9, :lo12:Lmasks
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v3.16b, v3.16b		// byteswap Xi
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

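	// Reuse the block-processing body of gcm_ghash_neon for a single
	// block: x3 is the byte counter that Loop_neon decrements by 16, so
	// setting it to 16 makes the shared code run exactly once. The branch
	// targets Lgmult_neon, past the input load and XOR that only
	// gcm_ghash_neon needs.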
	mov	x3, #16
	b	Lgmult_neon


.globl	gcm_ghash_neon

.def gcm_ghash_neon
   .type 32
.endef
.align	4
gcm_ghash_neon:
	AARCH64_VALID_CALL_TARGET
	ld1	{v0.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8		// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, Lmasks		// load constants
	add	x9, x9, :lo12:Lmasks
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v0.16b, v0.16b		// byteswap Xi
	ext	v0.16b, v0.16b, v0.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

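	// Main loop: x2 points at the input and x3 holds the remaining length
	// in bytes. Each iteration absorbs one 16-byte block into Xi (kept in
	// v0) and multiplies by the twisted H loaded above; the subs/bne at
	// the bottom advances until x3 reaches zero.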
Loop_neon:
	ld1	{v3.16b}, [x2], #16	// load inp
	rev64	v3.16b, v3.16b		// byteswap inp
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi

Lgmult_neon:
	// Split the input into v3 and v4. (The upper halves are unused,
	// so it is okay to leave them alone.)
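	// The 128x128-bit carry-less multiply is done Karatsuba-style on
	// 64-bit halves: v5*v3 gives the low product (Xl), v6*v4 the high
	// product (Xh), and v7*(v3^v4) the middle term (Xm), where v7 already
	// holds the XOR of the two key halves. Each 64x64 multiply is in turn
	// built from 8x8-bit pmull instructions on byte-shifted copies of the
	// operands (A1..A4, B1..B4 below), with the Lmasks constants trimming
	// the overlap, following the byte-wise scheme of the ARMv4 NEON GHASH
	// this file is derived from.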
	ins	v4.d[0], v3.d[1]
	ext	v16.8b, v5.8b, v5.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v0.8b, v3.8b, v3.8b, #1		// B1
	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
	ext	v17.8b, v5.8b, v5.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
	ext	v18.8b, v5.8b, v5.8b, #3	// A3
	eor	v16.16b, v16.16b, v0.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v0.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v0.16b	// N = I + J
	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
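	// The zip1/zip2 pairs gather the low and high 64-bit halves of two
	// products into separate registers, so one eor/and/eor sequence does
	// the mask-and-fold from the 32-bit code for two terms at a time
	// (v24 = {k48, k32}, v25 = {k16, k0}); the second zip1/zip2 pair puts
	// the folded halves back where the shifts below expect them.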
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v0.16b, v0.16b, v16.16b
	eor	v0.16b, v0.16b, v18.16b
	eor	v3.8b, v3.8b, v4.8b	// Karatsuba pre-processing
	ext	v16.8b, v7.8b, v7.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v1.8b, v3.8b, v3.8b, #1		// B1
	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
	ext	v17.8b, v7.8b, v7.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
	ext	v18.8b, v7.8b, v7.8b, #3	// A3
	eor	v16.16b, v16.16b, v1.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v1.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v1.16b	// N = I + J
	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v1.16b, v1.16b, v18.16b
	ext	v16.8b, v6.8b, v6.8b, #1	// A1
	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
	ext	v2.8b, v4.8b, v4.8b, #1		// B1
	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
	ext	v17.8b, v6.8b, v6.8b, #2	// A2
	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
	ext	v19.8b, v4.8b, v4.8b, #2	// B2
	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
	ext	v18.8b, v6.8b, v6.8b, #3	// A3
	eor	v16.16b, v16.16b, v2.16b	// L = E + F
	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
	ext	v2.8b, v4.8b, v4.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v4.8b, v4.8b, #4	// B4
	eor	v18.16b, v18.16b, v2.16b	// N = I + J
	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v2.16b, v2.16b, v18.16b
	ext	v16.16b, v0.16b, v2.16b, #8
	eor	v1.16b, v1.16b, v0.16b	// Karatsuba post-processing
	eor	v1.16b, v1.16b, v2.16b
	eor	v1.16b, v1.16b, v16.16b	// Xm overlaps Xh.lo and Xl.hi
	ins	v0.d[1], v1.d[0]		// Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins	v2.d[0], v1.d[1]

	// equivalent of reduction_avx from ghash-x86_64.pl
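	// Reduce the 256-bit product modulo the GHASH polynomial
	// x^128 + x^7 + x^2 + x + 1. In this bit-reflected layout the first
	// phase folds with left shifts by 57, 62 and 63 (64 minus 7, 2 and 1)
	// and the second phase with right shifts totalling 1, 2 and 7,
	// mirroring the x^7 + x^2 + x terms of the polynomial.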
	shl	v17.2d, v0.2d, #57		// 1st phase
	shl	v18.2d, v0.2d, #62
	eor	v18.16b, v18.16b, v17.16b	//
	shl	v17.2d, v0.2d, #63
	eor	v18.16b, v18.16b, v17.16b	//
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor	v18.16b, v18.16b, v1.16b
	ins	v0.d[1], v18.d[0]		// Xl.d[1] ^= t2.d[0]
	ins	v2.d[0], v18.d[1]		// Xh.d[0] ^= t2.d[1]

	ushr	v18.2d, v0.2d, #1		// 2nd phase
	eor	v2.16b, v2.16b,v0.16b
	eor	v0.16b, v0.16b,v18.16b	//
	ushr	v18.2d, v18.2d, #6
	ushr	v0.2d, v0.2d, #1		//
	eor	v0.16b, v0.16b, v2.16b	//
	eor	v0.16b, v0.16b, v18.16b	//

	subs	x3, x3, #16
	bne	Loop_neon

	rev64	v0.16b, v0.16b		// byteswap Xi and write
	ext	v0.16b, v0.16b, v0.16b, #8
	st1	{v0.16b}, [x0]

	ret


.section	.rodata
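// Masks with the low 48, 32, 16 and 0 bits set, loaded as {v24, v25} by
// gcm_gmult_neon and gcm_ghash_neon. They play the role of the $k48/$k32/$k16
// constants in the 32-bit code quoted in the comments above (k0 clears the
// high half entirely, standing in for the vmov.i64 of zero).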
.align	4
Lmasks:
.quad	0x0000ffffffffffff	// k48
.quad	0x00000000ffffffff	// k32
.quad	0x000000000000ffff	// k16
.quad	0x0000000000000000	// k0
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif  // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(_WIN32)
#if defined(__ELF__)
// See https://www.airs.com/blog/archives/518.
.section .note.GNU-stack,"",%progbits
#endif