// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text

.globl	gcm_init_neon
.hidden	gcm_init_neon
.type	gcm_init_neon,%function
.align	4
gcm_init_neon:
	AARCH64_VALID_CALL_TARGET
	// This function is adapted from gcm_init_v8. xC2 is t3.
	ld1	{v17.2d}, [x1]			// load H
	movi	v19.16b, #0xe1
	shl	v19.2d, v19.2d, #57		// 0xc2.0
	ext	v3.16b, v17.16b, v17.16b, #8
	ushr	v18.2d, v19.2d, #63
	dup	v17.4s, v17.s[1]
	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
	ushr	v18.2d, v3.2d, #63
	sshr	v17.4s, v17.4s, #31		// broadcast carry bit
	and	v18.16b, v18.16b, v16.16b
	shl	v3.2d, v3.2d, #1
	ext	v18.16b, v18.16b, v18.16b, #8
	and	v16.16b, v16.16b, v17.16b
	orr	v3.16b, v3.16b, v18.16b	// H<<<=1
	eor	v5.16b, v3.16b, v16.16b	// twisted H
	st1	{v5.2d}, [x0]			// store Htable[0]
	ret
.size	gcm_init_neon,.-gcm_init_neon

.globl	gcm_gmult_neon
.hidden	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	AARCH64_VALID_CALL_TARGET
	ld1	{v3.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8		// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, .Lmasks		// load constants
	add	x9, x9, :lo12:.Lmasks
	ld1	{v24.2d, v25.2d}, [x9]
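	// v24 = {k48, k32} and v25 = {k16, 0}, per .Lmasks in .rodata below.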
	rev64	v3.16b, v3.16b		// byteswap Xi
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

	mov	x3, #16
	b	.Lgmult_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

.globl	gcm_ghash_neon
.hidden	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	AARCH64_VALID_CALL_TARGET
	ld1	{v0.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8		// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, .Lmasks		// load constants
	add	x9, x9, :lo12:.Lmasks
	ld1	{v24.2d, v25.2d}, [x9]
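	// v24 = {k48, k32} and v25 = {k16, 0}, as in gcm_gmult_neon.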
	rev64	v0.16b, v0.16b		// byteswap Xi
	ext	v0.16b, v0.16b, v0.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

.Loop_neon:
	ld1	{v3.16b}, [x2], #16	// load inp
	rev64	v3.16b, v3.16b		// byteswap inp
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi

.Lgmult_neon:
	// Split the input into v3 and v4. (The upper halves are unused,
	// so it is okay to leave them alone.)
	ins	v4.d[0], v3.d[1]
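	// First Karatsuba limb: the 64x64 carry-less product H.lo (v5) * Xi.lo (v3),
	// built from 8-bit pmull partial products and assembled into v0 below.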
	ext	v16.8b, v5.8b, v5.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v0.8b, v3.8b, v3.8b, #1		// B1
	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
	ext	v17.8b, v5.8b, v5.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
	ext	v18.8b, v5.8b, v5.8b, #3	// A3
	eor	v16.16b, v16.16b, v0.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v0.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v0.16b	// N = I + J
	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
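	// v16..v19 hold the 128-bit partial products L, M, N and K. zip1/zip2
	// gather their low and high 64-bit halves into pairs so the fold-and-mask
	// steps for (L, M) and (N, K) run two at a time, with v24 = {k48, k32}
	// and v25 = {k16, 0} supplying the masks.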
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v0.16b, v0.16b, v16.16b
	eor	v0.16b, v0.16b, v18.16b
	eor	v3.8b, v3.8b, v4.8b	// Karatsuba pre-processing
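	// Middle Karatsuba limb: (H.lo ^ H.hi) in v7 times (Xi.lo ^ Xi.hi) in v3;
	// the product is assembled into v1 below.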
	ext	v16.8b, v7.8b, v7.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v1.8b, v3.8b, v3.8b, #1		// B1
	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
	ext	v17.8b, v7.8b, v7.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
	ext	v18.8b, v7.8b, v7.8b, #3	// A3
	eor	v16.16b, v16.16b, v1.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v1.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v1.16b	// N = I + J
	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v1.16b, v1.16b, v18.16b
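	// High Karatsuba limb: H.hi (v6) * Xi.hi (v4); the product is assembled
	// into v2 below.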
	ext	v16.8b, v6.8b, v6.8b, #1	// A1
	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
	ext	v2.8b, v4.8b, v4.8b, #1		// B1
	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
	ext	v17.8b, v6.8b, v6.8b, #2	// A2
	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
	ext	v19.8b, v4.8b, v4.8b, #2	// B2
	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
	ext	v18.8b, v6.8b, v6.8b, #3	// A3
	eor	v16.16b, v16.16b, v2.16b	// L = E + F
	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
	ext	v2.8b, v4.8b, v4.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v4.8b, v4.8b, #4	// B4
	eor	v18.16b, v18.16b, v2.16b	// N = I + J
	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v2.16b, v2.16b, v18.16b
	ext	v16.16b, v0.16b, v2.16b, #8
	eor	v1.16b, v1.16b, v0.16b	// Karatsuba post-processing
	eor	v1.16b, v1.16b, v2.16b
	eor	v1.16b, v1.16b, v16.16b	// Xm overlaps Xh.lo and Xl.hi
	ins	v0.d[1], v1.d[0]		// Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins	v2.d[0], v1.d[1]

	// equivalent of reduction_avx from ghash-x86_64.pl
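	// Two-phase reduction modulo the bit-reflected GHASH polynomial
	// x^128 + x^7 + x^2 + x + 1; the shift counts 57, 62 and 63 correspond
	// to the x^7, x^2 and x terms.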
	shl	v17.2d, v0.2d, #57		// 1st phase
	shl	v18.2d, v0.2d, #62
	eor	v18.16b, v18.16b, v17.16b	//
	shl	v17.2d, v0.2d, #63
	eor	v18.16b, v18.16b, v17.16b	//
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor	v18.16b, v18.16b, v1.16b
	ins	v0.d[1], v18.d[0]		// Xl.d[1] ^= t2.d[0]
	ins	v2.d[0], v18.d[1]		// Xh.d[0] ^= t2.d[1]

	ushr	v18.2d, v0.2d, #1		// 2nd phase
	eor	v2.16b, v2.16b,v0.16b
	eor	v0.16b, v0.16b,v18.16b	//
	ushr	v18.2d, v18.2d, #6
	ushr	v0.2d, v0.2d, #1		//
	eor	v0.16b, v0.16b, v2.16b	//
	eor	v0.16b, v0.16b, v18.16b	//

	subs	x3, x3, #16
	bne	.Loop_neon

	rev64	v0.16b, v0.16b		// byteswap Xi and write
	ext	v0.16b, v0.16b, v0.16b, #8
	st1	{v0.16b}, [x0]

	ret
.size	gcm_ghash_neon,.-gcm_ghash_neon

.section	.rodata
.align	4
.Lmasks:
.quad	0x0000ffffffffffff	// k48
.quad	0x00000000ffffffff	// k32
.quad	0x000000000000ffff	// k16
.quad	0x0000000000000000	// k0
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif
#endif  // !OPENSSL_NO_ASM
.section	.note.GNU-stack,"",%progbits