// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
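// MemorySanitizer cannot see writes done by hand-written assembly, so the
// block above forces OPENSSL_NO_ASM (and thus the C fallbacks) under MSan.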

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
.text

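// These are the plain-NEON GHASH routines, used when the ARMv8 Crypto
// Extensions (64-bit PMULL) are not available. Register arguments, as read
// from the code below:
//   gcm_init_neon:  x0 = Htable (output), x1 = H (the hash key)
//   gcm_gmult_neon: x0 = Xi (in/out), x1 = Htable
//   gcm_ghash_neon: x0 = Xi (in/out), x1 = Htable, x2 = input,
//                   x3 = input length in bytes (a multiple of 16)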
.globl	gcm_init_neon
.hidden	gcm_init_neon
.type	gcm_init_neon,%function
.align	4
gcm_init_neon:
	// This function is adapted from gcm_init_v8. xC2 is t3.
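	// In outline: H's 64-bit halves are swapped and the 128-bit value is
	// shifted left by one bit ("H<<<=1" below). The bit shifted out at
	// the top is broadcast ("carry bit") and used to mask the
	// 0xc2....01 constant, so the reduction by the GHASH polynomial
	// x^128 + x^7 + x^2 + x + 1 is folded in without a branch. The
	// result is the "twisted H" written to Htable[0].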
	ld1	{v17.2d}, [x1]			// load H
	movi	v19.16b, #0xe1
	shl	v19.2d, v19.2d, #57		// 0xc2.0
	ext	v3.16b, v17.16b, v17.16b, #8
	ushr	v18.2d, v19.2d, #63
	dup	v17.4s, v17.s[1]
	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
	ushr	v18.2d, v3.2d, #63
	sshr	v17.4s, v17.4s, #31		// broadcast carry bit
	and	v18.16b, v18.16b, v16.16b
	shl	v3.2d, v3.2d, #1
	ext	v18.16b, v18.16b, v18.16b, #8
	and	v16.16b, v16.16b, v17.16b
	orr	v3.16b, v3.16b, v18.16b	// H<<<=1
	eor	v5.16b, v3.16b, v16.16b	// twisted H
	st1	{v5.2d}, [x0]			// store Htable[0]
	ret
.size	gcm_init_neon,.-gcm_init_neon

.globl	gcm_gmult_neon
.hidden	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	ld1	{v3.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8		// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, .Lmasks		// load constants
	add	x9, x9, :lo12:.Lmasks
	ld1	{v24.2d, v25.2d}, [x9]
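	// v24 = {k48, k32} and v25 = {k16, k0}, in the order laid out by the
	// .Lmasks table in .rodata below.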
	rev64	v3.16b, v3.16b		// byteswap Xi
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

	mov	x3, #16
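	// x3 is the byte counter shared with gcm_ghash_neon; starting it at
	// 16 makes the subs/bne at the bottom of .Loop_neon fall through
	// after one pass, so gcm_gmult_neon reuses the per-block body below.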
	b	.Lgmult_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

.globl	gcm_ghash_neon
.hidden	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	ld1	{v0.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8		// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, .Lmasks		// load constants
	add	x9, x9, :lo12:.Lmasks
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v0.16b, v0.16b		// byteswap Xi
	ext	v0.16b, v0.16b, v0.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

.Loop_neon:
	ld1	{v3.16b}, [x2], #16	// load inp
	rev64	v3.16b, v3.16b		// byteswap inp
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi

.Lgmult_neon:
	// Split the input into v3 and v4. (The upper halves are unused,
	// so it is okay to leave them alone.)
	ins	v4.d[0], v3.d[1]
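	// From here the 128x128-bit carry-less multiply is assembled from
	// three 64x64-bit products, Karatsuba-style: v5*v3 (into v0),
	// v7*(v3^v4) for the middle term (into v1), and v6*v4 (into v2).
	// Each 64x64 product is in turn built from 8x8-bit pmull results
	// using the byte-rotation-and-mask scheme carried over from the
	// 32-bit (vmull.p8) version: the A1..A4 / B1..B4 terms below.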
	ext	v16.8b, v5.8b, v5.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v0.8b, v3.8b, v3.8b, #1		// B1
	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
	ext	v17.8b, v5.8b, v5.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
	ext	v18.8b, v5.8b, v5.8b, #3	// A3
	eor	v16.16b, v16.16b, v0.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v0.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v0.16b	// N = I + J
	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v0.16b, v0.16b, v16.16b
	eor	v0.16b, v0.16b, v18.16b
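	// v0 now holds the first 64x64 product, v5 * v3.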
	eor	v3.8b, v3.8b, v4.8b	// Karatsuba pre-processing
	ext	v16.8b, v7.8b, v7.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v1.8b, v3.8b, v3.8b, #1		// B1
	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
	ext	v17.8b, v7.8b, v7.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
	ext	v18.8b, v7.8b, v7.8b, #3	// A3
	eor	v16.16b, v16.16b, v1.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v1.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v1.16b	// N = I + J
	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v1.16b, v1.16b, v18.16b
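	// v1 now holds the middle Karatsuba product, v7 * (v3 ^ v4)
	// (v3 was folded with v4 in the pre-processing step above).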
	ext	v16.8b, v6.8b, v6.8b, #1	// A1
	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
	ext	v2.8b, v4.8b, v4.8b, #1		// B1
	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
	ext	v17.8b, v6.8b, v6.8b, #2	// A2
	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
	ext	v19.8b, v4.8b, v4.8b, #2	// B2
	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
	ext	v18.8b, v6.8b, v6.8b, #3	// A3
	eor	v16.16b, v16.16b, v2.16b	// L = E + F
	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
	ext	v2.8b, v4.8b, v4.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v4.8b, v4.8b, #4	// B4
	eor	v18.16b, v18.16b, v2.16b	// N = I + J
	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v2.16b, v2.16b, v18.16b
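	// All three products are ready: v0 (low), v1 (middle), v2 (high).
	// The next few instructions combine them into the 256-bit result.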
	ext	v16.16b, v0.16b, v2.16b, #8
	eor	v1.16b, v1.16b, v0.16b	// Karatsuba post-processing
	eor	v1.16b, v1.16b, v2.16b
	eor	v1.16b, v1.16b, v16.16b	// Xm overlaps Xh.lo and Xl.hi
	ins	v0.d[1], v1.d[0]		// Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins	v2.d[0], v1.d[1]

	// equivalent of reduction_avx from ghash-x86_64.pl
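	// Roughly: the 256-bit product in v0/v2 is folded back to 128 bits
	// modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1; in this
	// bit-reflected representation, the shifts by 63, 62 and 57 in the
	// first phase correspond to the polynomial's x, x^2 and x^7 terms.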
	shl	v17.2d, v0.2d, #57		// 1st phase
	shl	v18.2d, v0.2d, #62
	eor	v18.16b, v18.16b, v17.16b	//
	shl	v17.2d, v0.2d, #63
	eor	v18.16b, v18.16b, v17.16b	//
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor	v18.16b, v18.16b, v1.16b
	ins	v0.d[1], v18.d[0]		// Xl.d[1] ^= t2.d[0]
	ins	v2.d[0], v18.d[1]		// Xh.d[0] ^= t2.d[1]

	ushr	v18.2d, v0.2d, #1		// 2nd phase
	eor	v2.16b, v2.16b,v0.16b
	eor	v0.16b, v0.16b,v18.16b	//
	ushr	v18.2d, v18.2d, #6
	ushr	v0.2d, v0.2d, #1		//
	eor	v0.16b, v0.16b, v2.16b	//
	eor	v0.16b, v0.16b, v18.16b	//

	subs	x3, x3, #16
	bne	.Loop_neon

	rev64	v0.16b, v0.16b		// byteswap Xi and write
	ext	v0.16b, v0.16b, v0.16b, #8
	st1	{v0.16b}, [x0]

	ret
.size	gcm_ghash_neon,.-gcm_ghash_neon

.section	.rodata
.align	4
.Lmasks:
.quad	0x0000ffffffffffff	// k48
.quad	0x00000000ffffffff	// k32
.quad	0x000000000000ffff	// k16
.quad	0x0000000000000000	// k0
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif
#endif  // !OPENSSL_NO_ASM
.section	.note.GNU-stack,"",%progbits