// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#include "ring_core_generated/prefix_symbols_asm.h"
#include <ring-core/arm_arch.h>

.text

.globl	gcm_init_neon
.hidden	gcm_init_neon
.type	gcm_init_neon,%function
.align	4
gcm_init_neon:
	AARCH64_VALID_CALL_TARGET
	// This function is adapted from gcm_init_v8. xC2 is t3.
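	// In effect, this stores H<<<1 (a 128-bit left shift by one) with the
	// 0xc2...01 reduction constant folded in when H's carried-out top bit
	// was set; gcm_gmult_neon and gcm_ghash_neon consume this "twisted H"
	// as Htable[0].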
	ld1	{v17.2d}, [x1]			// load H
	movi	v19.16b, #0xe1
	shl	v19.2d, v19.2d, #57		// 0xc2.0
	ext	v3.16b, v17.16b, v17.16b, #8
	ushr	v18.2d, v19.2d, #63
	dup	v17.4s, v17.s[1]
	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
	ushr	v18.2d, v3.2d, #63
	sshr	v17.4s, v17.4s, #31		// broadcast carry bit
	and	v18.16b, v18.16b, v16.16b
	shl	v3.2d, v3.2d, #1
	ext	v18.16b, v18.16b, v18.16b, #8
	and	v16.16b, v16.16b, v17.16b
	orr	v3.16b, v3.16b, v18.16b	// H<<<=1
	eor	v5.16b, v3.16b, v16.16b	// twisted H
	st1	{v5.2d}, [x0]			// store Htable[0]
	ret
.size	gcm_init_neon,.-gcm_init_neon

.globl	gcm_gmult_neon
.hidden	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	AARCH64_VALID_CALL_TARGET
	ld1	{v3.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8		// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, .Lmasks		// load constants
	add	x9, x9, :lo12:.Lmasks
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v3.16b, v3.16b		// byteswap Xi
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing
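	// v5/v6 now hold the low and high 64-bit halves of the twisted H, and
	// v7 = v5 ^ v6 feeds the middle Karatsuba product in .Lgmult_neon.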

	mov	x3, #16
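	// x3 is the remaining byte count: 16 makes the shared tail below
	// process a single block and then fall out of .Loop_neon.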
	b	.Lgmult_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

.globl	gcm_ghash_neon
.hidden	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	AARCH64_VALID_CALL_TARGET
	ld1	{v0.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8		// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, .Lmasks		// load constants
	add	x9, x9, :lo12:.Lmasks
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v0.16b, v0.16b		// byteswap Xi
	ext	v0.16b, v0.16b, v0.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

.Loop_neon:
	ld1	{v3.16b}, [x2], #16	// load inp
	rev64	v3.16b, v3.16b		// byteswap inp
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi

.Lgmult_neon:
	// Split the input into v3 and v4. (The upper halves are unused,
	// so it is okay to leave them alone.)
	ins	v4.d[0], v3.d[1]
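	// v3.d[0] holds the low half of the byteswapped input and v4.d[0] the
	// high half; they are multiplied by v5 and v6 respectively, with v7
	// covering the Karatsuba middle term.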
	ext	v16.8b, v5.8b, v5.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v0.8b, v3.8b, v3.8b, #1		// B1
	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
	ext	v17.8b, v5.8b, v5.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
	ext	v18.8b, v5.8b, v5.8b, #3	// A3
	eor	v16.16b, v16.16b, v0.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v0.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v0.16b	// N = I + J
	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
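	// v24 = {k48, k32} and v25 = {k16, k0} from .Lmasks, so the zip1/zip2
	// interleaving below lets one and/eor pair apply the masks to two
	// terms at a time.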
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v0.16b, v0.16b, v16.16b
	eor	v0.16b, v0.16b, v18.16b
	eor	v3.8b, v3.8b, v4.8b	// Karatsuba pre-processing
	ext	v16.8b, v7.8b, v7.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v1.8b, v3.8b, v3.8b, #1		// B1
	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
	ext	v17.8b, v7.8b, v7.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
	ext	v18.8b, v7.8b, v7.8b, #3	// A3
	eor	v16.16b, v16.16b, v1.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v1.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v1.16b	// N = I + J
	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v1.16b, v1.16b, v18.16b
	ext	v16.8b, v6.8b, v6.8b, #1	// A1
	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
	ext	v2.8b, v4.8b, v4.8b, #1		// B1
	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
	ext	v17.8b, v6.8b, v6.8b, #2	// A2
	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
	ext	v19.8b, v4.8b, v4.8b, #2	// B2
	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
	ext	v18.8b, v6.8b, v6.8b, #3	// A3
	eor	v16.16b, v16.16b, v2.16b	// L = E + F
	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
	ext	v2.8b, v4.8b, v4.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v4.8b, v4.8b, #4	// B4
	eor	v18.16b, v18.16b, v2.16b	// N = I + J
	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v2.16b, v2.16b, v18.16b
	ext	v16.16b, v0.16b, v2.16b, #8
	eor	v1.16b, v1.16b, v0.16b	// Karatsuba post-processing
	eor	v1.16b, v1.16b, v2.16b
	eor	v1.16b, v1.16b, v16.16b	// Xm overlaps Xh.lo and Xl.hi
	ins	v0.d[1], v1.d[0]		// Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins	v2.d[0], v1.d[1]

	// equivalent of reduction_avx from ghash-x86_64.pl
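	// Both phases fold by x, x^2 and x^7, the low-degree terms of the
	// GHASH polynomial x^128 + x^7 + x^2 + x + 1; in this bit-reflected
	// representation they appear as left shifts by 63/62/57 and right
	// shifts by 1/2/7.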
	shl	v17.2d, v0.2d, #57		// 1st phase
	shl	v18.2d, v0.2d, #62
	eor	v18.16b, v18.16b, v17.16b	//
	shl	v17.2d, v0.2d, #63
	eor	v18.16b, v18.16b, v17.16b	//
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor	v18.16b, v18.16b, v1.16b
	ins	v0.d[1], v18.d[0]		// Xl.d[1] ^= t2.d[0]
	ins	v2.d[0], v18.d[1]		// Xh.d[0] ^= t2.d[1]

	ushr	v18.2d, v0.2d, #1		// 2nd phase
	eor	v2.16b, v2.16b,v0.16b
	eor	v0.16b, v0.16b,v18.16b	//
	ushr	v18.2d, v18.2d, #6
	ushr	v0.2d, v0.2d, #1		//
	eor	v0.16b, v0.16b, v2.16b	//
	eor	v0.16b, v0.16b, v18.16b	//

	subs	x3, x3, #16
	bne	.Loop_neon

	rev64	v0.16b, v0.16b		// byteswap Xi and write
	ext	v0.16b, v0.16b, v0.16b, #8
	st1	{v0.16b}, [x0]

	ret
.size	gcm_ghash_neon,.-gcm_ghash_neon

.section	.rodata
.align	4
.Lmasks:
.quad	0x0000ffffffffffff	// k48
.quad	0x00000000ffffffff	// k32
.quad	0x000000000000ffff	// k16
.quad	0x0000000000000000	// k0
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif
#endif  // !OPENSSL_NO_ASM
.section	.note.GNU-stack,"",%progbits