// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text

.globl	gcm_init_neon

.def gcm_init_neon
   .type 32
.endef
.align	4
gcm_init_neon:
	AARCH64_VALID_CALL_TARGET
	// This function is adapted from gcm_init_v8. xC2 is t3.
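	// Arguments, as inferred from the loads and stores below: x0 points
	// at Htable and x1 at the raw hash key H.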
	ld1	{v17.2d}, [x1]			// load H
	movi	v19.16b, #0xe1
	shl	v19.2d, v19.2d, #57		// 0xc2.0
	ext	v3.16b, v17.16b, v17.16b, #8
	ushr	v18.2d, v19.2d, #63
	dup	v17.4s, v17.s[1]
	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
	ushr	v18.2d, v3.2d, #63
	sshr	v17.4s, v17.4s, #31		// broadcast carry bit
	and	v18.16b, v18.16b, v16.16b
	shl	v3.2d, v3.2d, #1
	ext	v18.16b, v18.16b, v18.16b, #8
	and	v16.16b, v16.16b, v17.16b
	orr	v3.16b, v3.16b, v18.16b	// H<<<=1
	eor	v5.16b, v3.16b, v16.16b	// twisted H
	st1	{v5.2d}, [x0]			// store Htable[0]
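	// Only Htable[0] is populated here; the NEON gmult/ghash routines
	// below read just this single twisted-H entry.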
	ret


.globl	gcm_gmult_neon

.def gcm_gmult_neon
   .type 32
.endef
.align	4
gcm_gmult_neon:
	AARCH64_VALID_CALL_TARGET
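	// Arguments, as inferred from the loads below: x0 points at the
	// 16-byte hash state Xi and x1 at Htable (the twisted H).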
	ld1	{v3.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8		// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, Lmasks		// load constants
	add	x9, x9, :lo12:Lmasks
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v3.16b, v3.16b		// byteswap Xi
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing
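	// v5/v6 hold the low/high 64 bits of the twisted H and v7 their XOR,
	// so the 128x128-bit product can be assembled from three 64x64-bit
	// carry-less multiplies (Karatsuba).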

	mov	x3, #16
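	// Process Xi as a single block: with the residual length in x3 set
	// to 16, the shared Lgmult_neon/Loop_neon tail below runs exactly
	// once.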
	b	Lgmult_neon


.globl	gcm_ghash_neon

.def gcm_ghash_neon
   .type 32
.endef
.align	4
gcm_ghash_neon:
	AARCH64_VALID_CALL_TARGET
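	// Arguments, as inferred from the code below: x0 = Xi (16-byte hash
	// state), x1 = Htable (the twisted H), x2 = input pointer, x3 =
	// length in bytes, consumed 16 bytes per Loop_neon iteration.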
	ld1	{v0.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8		// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, Lmasks		// load constants
	add	x9, x9, :lo12:Lmasks
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v0.16b, v0.16b		// byteswap Xi
	ext	v0.16b, v0.16b, v0.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

Loop_neon:
	ld1	{v3.16b}, [x2], #16	// load inp
	rev64	v3.16b, v3.16b		// byteswap inp
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi
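	// Each iteration computes Xi = (Xi ^ inp) * H in GF(2^128), the
	// per-block GHASH step.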

Lgmult_neon:
	// Split the input into v3 and v4. (The upper halves are unused,
	// so it is okay to leave them alone.)
	ins	v4.d[0], v3.d[1]
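	// The three blocks below compute the low (v0), middle (v1) and high
	// (v2) Karatsuba products. Each 64x64-bit carry-less multiply is
	// synthesized from 8x8-bit pmull partial products (A1..A4, B1..B4),
	// since this fallback assumes only the byte-wise polynomial multiply
	// and not the 64-bit PMULL from the Crypto extensions.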
	ext	v16.8b, v5.8b, v5.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v0.8b, v3.8b, v3.8b, #1		// B1
	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
	ext	v17.8b, v5.8b, v5.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
	ext	v18.8b, v5.8b, v5.8b, #3	// A3
	eor	v16.16b, v16.16b, v0.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v0.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.
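	// The masks live at Lmasks: v24 = {k48, k32} and v25 = {k16, k0}, so
	// after zipping the t-values a single AND applies the right mask to
	// each pair (k0 standing in for the vmov.i64 above).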

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v0.16b	// N = I + J
	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v0.16b, v0.16b, v16.16b
	eor	v0.16b, v0.16b, v18.16b
	eor	v3.8b, v3.8b, v4.8b	// Karatsuba pre-processing
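	// v3 now holds (input lo) ^ (input hi), matching v7 = Hlo ^ Hhi; the
	// middle Karatsuba product accumulates in v1.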
	ext	v16.8b, v7.8b, v7.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v1.8b, v3.8b, v3.8b, #1		// B1
	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
	ext	v17.8b, v7.8b, v7.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
	ext	v18.8b, v7.8b, v7.8b, #3	// A3
	eor	v16.16b, v16.16b, v1.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v1.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v1.16b	// N = I + J
	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v1.16b, v1.16b, v18.16b
	ext	v16.8b, v6.8b, v6.8b, #1	// A1
	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
	ext	v2.8b, v4.8b, v4.8b, #1		// B1
	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
	ext	v17.8b, v6.8b, v6.8b, #2	// A2
	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
	ext	v19.8b, v4.8b, v4.8b, #2	// B2
	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
	ext	v18.8b, v6.8b, v6.8b, #3	// A3
	eor	v16.16b, v16.16b, v2.16b	// L = E + F
	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
	ext	v2.8b, v4.8b, v4.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v4.8b, v4.8b, #4	// B4
	eor	v18.16b, v18.16b, v2.16b	// N = I + J
	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v2.16b, v2.16b, v18.16b
	ext	v16.16b, v0.16b, v2.16b, #8
	eor	v1.16b, v1.16b, v0.16b	// Karatsuba post-processing
	eor	v1.16b, v1.16b, v2.16b
	eor	v1.16b, v1.16b, v16.16b	// Xm overlaps Xh.lo and Xl.hi
	ins	v0.d[1], v1.d[0]		// Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins	v2.d[0], v1.d[1]

	// equivalent of reduction_avx from ghash-x86_64.pl
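	// The 256-bit product is reduced modulo the GHASH polynomial
	// x^128 + x^7 + x^2 + x + 1 in its bit-reflected form: the first
	// phase folds Xl using left shifts by 63, 62 and 57 (the x, x^2 and
	// x^7 terms), and the second phase completes the fold with the
	// matching right shifts by 1, 2 and 7 plus Xl itself.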
	shl	v17.2d, v0.2d, #57		// 1st phase
	shl	v18.2d, v0.2d, #62
	eor	v18.16b, v18.16b, v17.16b	//
	shl	v17.2d, v0.2d, #63
	eor	v18.16b, v18.16b, v17.16b	//
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor	v18.16b, v18.16b, v1.16b
	ins	v0.d[1], v18.d[0]		// Xl.d[1] ^= t2.d[0]
	ins	v2.d[0], v18.d[1]		// Xh.d[0] ^= t2.d[1]

	ushr	v18.2d, v0.2d, #1		// 2nd phase
	eor	v2.16b, v2.16b,v0.16b
	eor	v0.16b, v0.16b,v18.16b	//
	ushr	v18.2d, v18.2d, #6
	ushr	v0.2d, v0.2d, #1		//
	eor	v0.16b, v0.16b, v2.16b	//
	eor	v0.16b, v0.16b, v18.16b	//

	subs	x3, x3, #16
	bne	Loop_neon

	rev64	v0.16b, v0.16b		// byteswap Xi and write
	ext	v0.16b, v0.16b, v0.16b, #8
	st1	{v0.16b}, [x0]

	ret


.section	.rodata
.align	4
Lmasks:
.quad	0x0000ffffffffffff	// k48
.quad	0x00000000ffffffff	// k32
.quad	0x000000000000ffff	// k16
.quad	0x0000000000000000	// k0
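// The four masks above are loaded pairwise into v24/v25 by the routines
// above; kN keeps the low N bits of a partial-product half, and k0 clears
// t3's high half entirely.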
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif
#endif  // !OPENSSL_NO_ASM
