• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
// Fallback so that the __has_feature() test below is well-formed on
// compilers that do not provide it.
4#if !defined(__has_feature)
5#define __has_feature(x) 0
6#endif
// Under MemorySanitizer, force OPENSSL_NO_ASM so that everything guarded
// by it below is compiled out.
7#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
8#define OPENSSL_NO_ASM
9#endif
10
// The remainder of the file is assembled only when assembly is enabled,
// the target is AArch64 and __ARM_MAX_ARCH__ is at least 7.
11#if !defined(OPENSSL_NO_ASM)
12#if defined(__aarch64__)
13#if defined(BORINGSSL_PREFIX)
14#include <boringssl_prefix_symbols_asm.h>
15#endif
// NOTE(review): AARCH64_VALID_CALL_TARGET and __ARM_MAX_ARCH__ are not
// defined in this file; presumably they come from this header -- confirm.
16#include <openssl/arm_arch.h>
17
18#if __ARM_MAX_ARCH__>=7
19.text
// Enable the crypto extension for the assembler; the routines below use
// pmull/pmull2 producing .1q results.
20.arch	armv8-a+crypto
// gcm_init_v8 -- derive the GHASH key table from the hash key H.
//
// In:    x0 = table to fill (six 16-byte entries, 96 bytes written)
//        x1 = hash key H (16 bytes read)
// Out:   table[0] = twisted H
//        table[1] = packed Karatsuba terms: low half  H.lo^H.hi,
//                                           high half H^2.lo^H^2.hi
//        table[2] = H^2
//        table[3] = H^3
//        table[4] = packed Karatsuba terms: low half  H^3.lo^H^3.hi,
//                                           high half H^4.lo^H^4.hi
//        table[5] = H^4
// Clobb: x0 (left 48 bytes past its entry value), v0-v7, v16-v22.
//        No stack access and no calls.
// NOTE(review): the C prototype is not visible in this file; presumably
// (Htable, H) in that order -- confirm against the caller.
21.globl	gcm_init_v8
22
// COFF symbol record for the label below.
// NOTE(review): .type 32 is presumably the "function" type code -- confirm
// against the assembler documentation.
23.def gcm_init_v8
24   .type 32
25.endef
26.align	4
27gcm_init_v8:
28	AARCH64_VALID_CALL_TARGET
29	ld1	{v17.2d},[x1]		//load input H
	// v19 = 0xe1 in every byte, then each 64-bit lane shifted left by 57,
	// i.e. 0xc2 in the top byte of each lane. It is the second operand of
	// every reduction pmull in this routine.
30	movi	v19.16b,#0xe1
31	shl	v19.2d,v19.2d,#57		//0xc2.0
32	ext	v3.16b,v17.16b,v17.16b,#8
33	ushr	v18.2d,v19.2d,#63
34	dup	v17.4s,v17.s[1]
35	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
36	ushr	v18.2d,v3.2d,#63
37	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
38	and	v18.16b,v18.16b,v16.16b
39	shl	v3.2d,v3.2d,#1
40	ext	v18.16b,v18.16b,v18.16b,#8
	// v16 becomes t0 when the broadcast carry mask is all-ones, else zero.
41	and	v16.16b,v16.16b,v17.16b
42	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
43	eor	v20.16b,v3.16b,v16.16b		//twisted H
44	st1	{v20.2d},[x0],#16		//store Htable[0]
45
46	//calculate H^2
	// Three carry-less multiplies: lo*lo -> v0, hi*hi -> v2 and
	// (lo^hi)*(lo^hi) -> v1. v16 keeps H.lo^H.hi in both halves and is
	// reused for the packed table entry and for the H^3 product below.
47	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
48	pmull	v0.1q,v20.1d,v20.1d
49	eor	v16.16b,v16.16b,v20.16b
50	pmull2	v2.1q,v20.2d,v20.2d
51	pmull	v1.1q,v16.1d,v16.1d
52
53	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
54	eor	v18.16b,v0.16b,v2.16b
55	eor	v1.16b,v1.16b,v17.16b
56	eor	v1.16b,v1.16b,v18.16b
	// Two-phase reduction of the product held in v0/v1/v2, using v19.
57	pmull	v18.1q,v0.1d,v19.1d		//1st phase
58
59	ins	v2.d[0],v1.d[1]
60	ins	v1.d[1],v0.d[0]
61	eor	v0.16b,v1.16b,v18.16b
62
63	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
64	pmull	v0.1q,v0.1d,v19.1d
65	eor	v18.16b,v18.16b,v2.16b
	// v22 = H^2 (stored below as Htable[2])
66	eor	v22.16b,v0.16b,v18.16b
67
68	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
69	eor	v17.16b,v17.16b,v22.16b
	// v21 = { upper half of v16, lower half of v17 }
	//     = { H.lo^H.hi, H^2.lo^H^2.hi }
70	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
71	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
72	//calculate H^3 and H^4
	// Two products are interleaved: H*H^2 in v0/v1/v2 and H^2*H^2 in
	// v5/v6/v7.
73	pmull	v0.1q,v20.1d, v22.1d
74	pmull	v5.1q,v22.1d,v22.1d
75	pmull2	v2.1q,v20.2d, v22.2d
76	pmull2	v7.1q,v22.2d,v22.2d
77	pmull	v1.1q,v16.1d,v17.1d
78	pmull	v6.1q,v17.1d,v17.1d
79
80	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
81	ext	v17.16b,v5.16b,v7.16b,#8
82	eor	v18.16b,v0.16b,v2.16b
83	eor	v1.16b,v1.16b,v16.16b
84	eor	v4.16b,v5.16b,v7.16b
85	eor	v6.16b,v6.16b,v17.16b
86	eor	v1.16b,v1.16b,v18.16b
87	pmull	v18.1q,v0.1d,v19.1d		//1st phase
88	eor	v6.16b,v6.16b,v4.16b
89	pmull	v4.1q,v5.1d,v19.1d
90
91	ins	v2.d[0],v1.d[1]
92	ins	v7.d[0],v6.d[1]
93	ins	v1.d[1],v0.d[0]
94	ins	v6.d[1],v5.d[0]
95	eor	v0.16b,v1.16b,v18.16b
96	eor	v5.16b,v6.16b,v4.16b
97
98	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
99	ext	v4.16b,v5.16b,v5.16b,#8
100	pmull	v0.1q,v0.1d,v19.1d
101	pmull	v5.1q,v5.1d,v19.1d
102	eor	v18.16b,v18.16b,v2.16b
103	eor	v4.16b,v4.16b,v7.16b
	// From here on v20 and v22 no longer hold H and H^2; both were
	// already stored above.
104	eor	v20.16b, v0.16b,v18.16b		//H^3
105	eor	v22.16b,v5.16b,v4.16b		//H^4
106
107	ext	v16.16b,v20.16b, v20.16b,#8		//Karatsuba pre-processing
108	ext	v17.16b,v22.16b,v22.16b,#8
109	eor	v16.16b,v16.16b,v20.16b
110	eor	v17.16b,v17.16b,v22.16b
111	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
	// Last store has no write-back, so x0 stays at entry value + 48.
112	st1	{v20.2d,v21.2d,v22.2d},[x0]		//store Htable[3..5]
113	ret
114
// gcm_gmult_v8 -- multiply the 16-byte value Xi by H, in place.
//
// In:    x0 = Xi (16 bytes; read, then overwritten with the product)
//        x1 = key table; only the first 32 bytes are read (twisted H and
//             the packed Karatsuba entry that follows it)
// Clobb: v0-v3, v17-v21. x0 and x1 are left unchanged.
//        No stack access and no calls.
// NOTE(review): the C prototype is not visible in this file; presumably
// (Xi, Htable) in that order -- confirm against the caller.
115.globl	gcm_gmult_v8
116
// COFF symbol record for the label below.
117.def gcm_gmult_v8
118   .type 32
119.endef
120.align	4
121gcm_gmult_v8:
122	AARCH64_VALID_CALL_TARGET
123	ld1	{v17.2d},[x0]		//load Xi
	// v19 = 0xc2 in the top byte of each 64-bit lane (reduction constant)
124	movi	v19.16b,#0xe1
125	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
126	shl	v19.2d,v19.2d,#57
	// On little-endian builds reverse the bytes within each 64-bit half;
	// together with the ext #8 below this reverses all 16 bytes of Xi.
127#ifndef __ARMEB__
128	rev64	v17.16b,v17.16b
129#endif
130	ext	v3.16b,v17.16b,v17.16b,#8
131
132	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
133	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
134	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
135	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
136
137	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
138	eor	v18.16b,v0.16b,v2.16b
139	eor	v1.16b,v1.16b,v17.16b
140	eor	v1.16b,v1.16b,v18.16b
141	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
142
143	ins	v2.d[0],v1.d[1]
144	ins	v1.d[1],v0.d[0]
145	eor	v0.16b,v1.16b,v18.16b
146
147	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
148	pmull	v0.1q,v0.1d,v19.1d
149	eor	v18.16b,v18.16b,v2.16b
150	eor	v0.16b,v0.16b,v18.16b
151
	// Undo the load-time byte reversal before storing.
152#ifndef __ARMEB__
153	rev64	v0.16b,v0.16b
154#endif
155	ext	v0.16b,v0.16b,v0.16b,#8
156	st1	{v0.2d},[x0]		//write out Xi
157
158	ret
159
// gcm_ghash_v8 -- fold a run of 16-byte input blocks into Xi.
//
// In:    x0 = Xi (16 bytes, updated in place)
//        x1 = key table
//        x2 = input pointer
//        x3 = input length in bytes
// Flow:  x3 >= 64 branches to Lgcm_ghash_v8_4x with all arguments
//        untouched. Otherwise Loop_mod2x_v8 handles two blocks per
//        iteration and Lodd_tail_v8 handles a single leftover block.
// Clobb: (this path) x1, x2, x3, x12, flags, v0-v7, v16-v22.
//        No stack access and no calls.
// NOTE(review): any x3 below 32 reaches Lodd_tail_v8, which always
// consumes one 16-byte block, so x3 == 0 is not handled; presumably
// callers pass a non-zero multiple of 16 -- confirm against the caller.
// NOTE(review): the C prototype is not visible in this file; presumably
// (Xi, Htable, inp, len) in that order -- confirm against the caller.
160.globl	gcm_ghash_v8
161
// COFF symbol record for the label below.
162.def gcm_ghash_v8
163   .type 32
164.endef
165.align	4
166gcm_ghash_v8:
167	AARCH64_VALID_CALL_TARGET
168	cmp	x3,#64
169	b.hs	Lgcm_ghash_v8_4x		//64 bytes or more: take the 4x path
170	ld1	{v0.2d},[x0]		//load [rotated] Xi
171						//"[rotated]" means that
172						//loaded value would have
173						//to be rotated in order to
174						//make it appear as in
175						//algorithm specification
	// The flags set by this subs are consumed by the csel ...,eq and the
	// b.lo further down; nothing in between writes the flags.
176	subs	x3,x3,#32		//see if x3 is 32 or larger
177	mov	x12,#16		//x12 is used as post-
178						//increment for input pointer;
179						//as loop is modulo-scheduled
180						//x12 is zeroed just in time
181						//to preclude overstepping
182						//inp[len], which means that
183						//last block[s] are actually
184						//loaded twice, but last
185						//copy is not processed
186	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
187	movi	v19.16b,#0xe1
	// v22 = third table entry (H^2)
188	ld1	{v22.2d},[x1]
189	csel	x12,xzr,x12,eq			//is it time to zero x12?
190	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
191	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
192	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
193#ifndef __ARMEB__
194	rev64	v16.16b,v16.16b
195	rev64	v0.16b,v0.16b
196#endif
197	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
198	b.lo	Lodd_tail_v8		//x3 was less than 32
199	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
200#ifndef __ARMEB__
201	rev64	v17.16b,v17.16b
202#endif
203	ext	v7.16b,v17.16b,v17.16b,#8
204	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
	// The low and high products of the second block of the pair are
	// started here; the loop body finishes them (v5 = middle product).
205	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
206	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
207	pmull2	v6.1q,v20.2d,v7.2d
208	b	Loop_mod2x_v8
209
210.align	4
// Loop invariants on entry: v3 = (first block of the pair)^Xi,
// v4/v6 = low/high product of H with the second block, v17 = that
// block's Karatsuba sum, x3 = bytes remaining minus 32.
211Loop_mod2x_v8:
212	ext	v18.16b,v3.16b,v3.16b,#8
213	subs	x3,x3,#32		//is there more data?
214	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
215	csel	x12,xzr,x12,lo			//is it time to zero x12?
216
217	pmull	v5.1q,v21.1d,v17.1d
218	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
219	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
220	eor	v0.16b,v0.16b,v4.16b		//accumulate
221	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
222	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]
223
224	eor	v2.16b,v2.16b,v6.16b
225	csel	x12,xzr,x12,eq			//is it time to zero x12?
226	eor	v1.16b,v1.16b,v5.16b
227
228	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
229	eor	v18.16b,v0.16b,v2.16b
230	eor	v1.16b,v1.16b,v17.16b
231	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
232#ifndef __ARMEB__
233	rev64	v16.16b,v16.16b
234#endif
235	eor	v1.16b,v1.16b,v18.16b
236	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
237
238#ifndef __ARMEB__
239	rev64	v17.16b,v17.16b
240#endif
241	ins	v2.d[0],v1.d[1]
242	ins	v1.d[1],v0.d[0]
243	ext	v7.16b,v17.16b,v17.16b,#8
244	ext	v3.16b,v16.16b,v16.16b,#8
245	eor	v0.16b,v1.16b,v18.16b
246	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
247	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early
248
249	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
250	pmull	v0.1q,v0.1d,v19.1d
251	eor	v3.16b,v3.16b,v18.16b
252	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
253	eor	v3.16b,v3.16b,v0.16b
254	pmull2	v6.1q,v20.2d,v7.2d
	// Flags here are still those of the subs at the top of the loop.
255	b.hs	Loop_mod2x_v8		//there was at least 32 more bytes
256
	// Loop exit: the loop folded the reduced Xi straight into v3 for the
	// next iteration. Rebuild a plain Xi in v0 and the plain rotated block
	// in v3 so that the tail/done code can be shared with the short path.
257	eor	v2.16b,v2.16b,v18.16b
258	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
259	adds	x3,x3,#32		//re-construct x3
260	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
261	b.eq	Ldone_v8		//is x3 zero?
// Single-block tail: expects v0 = Xi, v16 = the input block and
// v3 = that block rotated by 8 bytes.
262Lodd_tail_v8:
263	ext	v18.16b,v0.16b,v0.16b,#8
264	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
265	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi
266
267	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
268	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
269	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
270	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
271
272	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
273	eor	v18.16b,v0.16b,v2.16b
274	eor	v1.16b,v1.16b,v17.16b
275	eor	v1.16b,v1.16b,v18.16b
276	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
277
278	ins	v2.d[0],v1.d[1]
279	ins	v1.d[1],v0.d[0]
280	eor	v0.16b,v1.16b,v18.16b
281
282	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
283	pmull	v0.1q,v0.1d,v19.1d
284	eor	v18.16b,v18.16b,v2.16b
285	eor	v0.16b,v0.16b,v18.16b
286
// Undo the load-time byte reversal and rotation, then store Xi.
287Ldone_v8:
288#ifndef __ARMEB__
289	rev64	v0.16b,v0.16b
290#endif
291	ext	v0.16b,v0.16b,v0.16b,#8
292	st1	{v0.2d},[x0]		//write out Xi
293
294	ret
295
// gcm_ghash_v8_4x -- four-blocks-per-iteration GHASH path.
//
// Not declared .globl; within this file it is entered only through the
// branch at the top of gcm_ghash_v8, which is taken when x3 >= 64.
//
// In:    x0 = Xi (16 bytes, updated in place)
//        x1 = key table; all six 16-byte entries are read
//        x2 = input pointer
//        x3 = input length in bytes
// Flow:  the first 64 bytes are loaded up front. Loop4x consumes a further
//        64 bytes per iteration for as long as at least 64 more remain.
//        Ltail4x finishes the group in flight and dispatches on the
//        0/16/32/48 bytes left to Ldone4x/Lone/Ltwo/Lthree.
// Regs:  v20/v22/v26/v28 = H, H^2, H^3, H^4; v21 and v27 = packed
//        Karatsuba entries (low half pairs with the lower power, high half
//        with the higher one); v19 = reduction constant;
//        v29/v31/v30 = running low/high/middle products of the last three
//        blocks of the current group.
// Clobb: x1, x2, x3, flags, v0-v7, v16-v31. v8-v15 are not touched.
//        No stack access and no calls.
// NOTE(review): the tail dispatch only distinguishes x3 < 32, == 32 and
// > 32, so a remainder that is not a multiple of 16 gets no special
// handling; presumably callers guarantee whole blocks -- confirm.
296.def gcm_ghash_v8_4x
297   .type 32
298.endef
299.align	4
300gcm_ghash_v8_4x:
301Lgcm_ghash_v8_4x:
302	ld1	{v0.2d},[x0]		//load [rotated] Xi
303	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
304	movi	v19.16b,#0xe1
305	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
306	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
307
	// First group of four blocks: v4 = I[i], v5..v7 = I[i+1..i+3]
308	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
309#ifndef __ARMEB__
310	rev64	v0.16b,v0.16b
311	rev64	v5.16b,v5.16b
312	rev64	v6.16b,v6.16b
313	rev64	v7.16b,v7.16b
314	rev64	v4.16b,v4.16b
315#endif
316	ext	v25.16b,v7.16b,v7.16b,#8
317	ext	v24.16b,v6.16b,v6.16b,#8
318	ext	v23.16b,v5.16b,v5.16b,#8
319
320	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
321	eor	v7.16b,v7.16b,v25.16b
322	pmull2	v31.1q,v20.2d,v25.2d
323	pmull	v30.1q,v21.1d,v7.1d
324
325	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
326	eor	v6.16b,v6.16b,v24.16b
327	pmull2	v24.1q,v22.2d,v24.2d
328	pmull2	v6.1q,v21.2d,v6.2d
329
330	eor	v29.16b,v29.16b,v16.16b
331	eor	v31.16b,v31.16b,v24.16b
332	eor	v30.16b,v30.16b,v6.16b
333
334	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
335	eor	v5.16b,v5.16b,v23.16b
336	pmull2	v23.1q,v26.2d,v23.2d
337	pmull	v5.1q,v27.1d,v5.1d
338
339	eor	v29.16b,v29.16b,v7.16b
340	eor	v31.16b,v31.16b,v23.16b
341	eor	v30.16b,v30.16b,v5.16b
342
	// x3 is biased by -128 (the 64 bytes already loaded plus the 64 the
	// first loop iteration would load); a borrow means fewer than 64
	// bytes remain after the first group, so skip the loop.
343	subs	x3,x3,#128
344	b.lo	Ltail4x
345
346	b	Loop4x
347
348.align	4
// Each iteration multiplies (Xi ^ I[i]) of the group in flight by H^4,
// adds the v29/v31/v30 sums, reduces to the next Xi, and interleaves the
// load and partial products of the next group of four blocks.
349Loop4x:
350	eor	v16.16b,v4.16b,v0.16b
351	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
352	ext	v3.16b,v16.16b,v16.16b,#8
353#ifndef __ARMEB__
354	rev64	v5.16b,v5.16b
355	rev64	v6.16b,v6.16b
356	rev64	v7.16b,v7.16b
357	rev64	v4.16b,v4.16b
358#endif
359
360	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
361	eor	v16.16b,v16.16b,v3.16b
362	pmull2	v2.1q,v28.2d,v3.2d
363	ext	v25.16b,v7.16b,v7.16b,#8
364	pmull2	v1.1q,v27.2d,v16.2d
365
366	eor	v0.16b,v0.16b,v29.16b
367	eor	v2.16b,v2.16b,v31.16b
368	ext	v24.16b,v6.16b,v6.16b,#8
369	eor	v1.16b,v1.16b,v30.16b
370	ext	v23.16b,v5.16b,v5.16b,#8
371
372	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
373	eor	v18.16b,v0.16b,v2.16b
374	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
375	eor	v7.16b,v7.16b,v25.16b
376	eor	v1.16b,v1.16b,v17.16b
377	pmull2	v31.1q,v20.2d,v25.2d
378	eor	v1.16b,v1.16b,v18.16b
379	pmull	v30.1q,v21.1d,v7.1d
380
381	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
382	ins	v2.d[0],v1.d[1]
383	ins	v1.d[1],v0.d[0]
384	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
385	eor	v6.16b,v6.16b,v24.16b
386	pmull2	v24.1q,v22.2d,v24.2d
387	eor	v0.16b,v1.16b,v18.16b
388	pmull2	v6.1q,v21.2d,v6.2d
389
390	eor	v29.16b,v29.16b,v16.16b
391	eor	v31.16b,v31.16b,v24.16b
392	eor	v30.16b,v30.16b,v6.16b
393
394	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
395	pmull	v0.1q,v0.1d,v19.1d
396	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
397	eor	v5.16b,v5.16b,v23.16b
398	eor	v18.16b,v18.16b,v2.16b
399	pmull2	v23.1q,v26.2d,v23.2d
400	pmull	v5.1q,v27.1d,v5.1d
401
402	eor	v0.16b,v0.16b,v18.16b
403	eor	v29.16b,v29.16b,v7.16b
404	eor	v31.16b,v31.16b,v23.16b
	// Swap the halves of the reduced Xi so it has the same layout as the
	// freshly loaded v4 when the two are XORed at the top of the loop or
	// in Ltail4x.
405	ext	v0.16b,v0.16b,v0.16b,#8
406	eor	v30.16b,v30.16b,v5.16b
407
408	subs	x3,x3,#64
409	b.hs	Loop4x		//at least 64 more bytes to load
410
// Finish the group in flight: v0/v1/v2 receive its unreduced
// low/middle/high product.
411Ltail4x:
412	eor	v16.16b,v4.16b,v0.16b
413	ext	v3.16b,v16.16b,v16.16b,#8
414
415	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
416	eor	v16.16b,v16.16b,v3.16b
417	pmull2	v2.1q,v28.2d,v3.2d
418	pmull2	v1.1q,v27.2d,v16.2d
419
420	eor	v0.16b,v0.16b,v29.16b
421	eor	v2.16b,v2.16b,v31.16b
422	eor	v1.16b,v1.16b,v30.16b
423
	// Remove 64 of the bias: x3 is now the number of bytes not yet loaded.
424	adds	x3,x3,#64
425	b.eq	Ldone4x		//nothing left: just reduce and store
426
427	cmp	x3,#32
428	b.lo	Lone		//fewer than 32 bytes left
429	b.eq	Ltwo		//exactly 32 bytes left
// More than 32 bytes left: three blocks are loaded. The pending product is
// reduced to Xi first, then the three blocks are multiplied by H^3, H^2
// and H; the unreduced sum is left in v0/v1/v2 for Ldone4x.
430Lthree:
431	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
432	eor	v18.16b,v0.16b,v2.16b
433	eor	v1.16b,v1.16b,v17.16b
434	ld1	{v4.2d,v5.2d,v6.2d},[x2]
435	eor	v1.16b,v1.16b,v18.16b
436#ifndef	__ARMEB__
437	rev64	v5.16b,v5.16b
438	rev64	v6.16b,v6.16b
439	rev64	v4.16b,v4.16b
440#endif
441
442	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
443	ins	v2.d[0],v1.d[1]
444	ins	v1.d[1],v0.d[0]
445	ext	v24.16b,v6.16b,v6.16b,#8
446	ext	v23.16b,v5.16b,v5.16b,#8
447	eor	v0.16b,v1.16b,v18.16b
448
449	pmull	v29.1q,v20.1d,v24.1d		//H·Ii+2
450	eor	v6.16b,v6.16b,v24.16b
451
452	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
453	pmull	v0.1q,v0.1d,v19.1d
454	eor	v18.16b,v18.16b,v2.16b
455	pmull2	v31.1q,v20.2d,v24.2d
456	pmull	v30.1q,v21.1d,v6.1d
457	eor	v0.16b,v0.16b,v18.16b
458	pmull	v7.1q,v22.1d,v23.1d		//H^2·Ii+1
459	eor	v5.16b,v5.16b,v23.16b
460	ext	v0.16b,v0.16b,v0.16b,#8
461
462	pmull2	v23.1q,v22.2d,v23.2d
463	eor	v16.16b,v4.16b,v0.16b
464	pmull2	v5.1q,v21.2d,v5.2d
465	ext	v3.16b,v16.16b,v16.16b,#8
466
467	eor	v29.16b,v29.16b,v7.16b
468	eor	v31.16b,v31.16b,v23.16b
469	eor	v30.16b,v30.16b,v5.16b
470
471	pmull	v0.1q,v26.1d,v3.1d		//H^3·(Xi+Ii)
472	eor	v16.16b,v16.16b,v3.16b
473	pmull2	v2.1q,v26.2d,v3.2d
474	pmull	v1.1q,v27.1d,v16.1d
475
476	eor	v0.16b,v0.16b,v29.16b
477	eor	v2.16b,v2.16b,v31.16b
478	eor	v1.16b,v1.16b,v30.16b
479	b	Ldone4x
480
481.align	4
// Exactly 32 bytes left: two blocks are loaded and multiplied by H^2 and H
// after the pending product has been reduced to Xi.
482Ltwo:
483	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
484	eor	v18.16b,v0.16b,v2.16b
485	eor	v1.16b,v1.16b,v17.16b
486	ld1	{v4.2d,v5.2d},[x2]
487	eor	v1.16b,v1.16b,v18.16b
488#ifndef	__ARMEB__
489	rev64	v5.16b,v5.16b
490	rev64	v4.16b,v4.16b
491#endif
492
493	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
494	ins	v2.d[0],v1.d[1]
495	ins	v1.d[1],v0.d[0]
496	ext	v23.16b,v5.16b,v5.16b,#8
497	eor	v0.16b,v1.16b,v18.16b
498
499	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
500	pmull	v0.1q,v0.1d,v19.1d
501	eor	v18.16b,v18.16b,v2.16b
502	eor	v0.16b,v0.16b,v18.16b
503	ext	v0.16b,v0.16b,v0.16b,#8
504
505	pmull	v29.1q,v20.1d,v23.1d		//H·Ii+1
506	eor	v5.16b,v5.16b,v23.16b
507
508	eor	v16.16b,v4.16b,v0.16b
509	ext	v3.16b,v16.16b,v16.16b,#8
510
511	pmull2	v31.1q,v20.2d,v23.2d
512	pmull	v30.1q,v21.1d,v5.1d
513
514	pmull	v0.1q,v22.1d,v3.1d		//H^2·(Xi+Ii)
515	eor	v16.16b,v16.16b,v3.16b
516	pmull2	v2.1q,v22.2d,v3.2d
517	pmull2	v1.1q,v21.2d,v16.2d
518
519	eor	v0.16b,v0.16b,v29.16b
520	eor	v2.16b,v2.16b,v31.16b
521	eor	v1.16b,v1.16b,v30.16b
522	b	Ldone4x
523
524.align	4
// Fewer than 32 bytes left: one block is loaded and multiplied by H after
// the pending product has been reduced to Xi. Falls through to Ldone4x.
525Lone:
526	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
527	eor	v18.16b,v0.16b,v2.16b
528	eor	v1.16b,v1.16b,v17.16b
529	ld1	{v4.2d},[x2]
530	eor	v1.16b,v1.16b,v18.16b
531#ifndef	__ARMEB__
532	rev64	v4.16b,v4.16b
533#endif
534
535	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
536	ins	v2.d[0],v1.d[1]
537	ins	v1.d[1],v0.d[0]
538	eor	v0.16b,v1.16b,v18.16b
539
540	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
541	pmull	v0.1q,v0.1d,v19.1d
542	eor	v18.16b,v18.16b,v2.16b
543	eor	v0.16b,v0.16b,v18.16b
544	ext	v0.16b,v0.16b,v0.16b,#8
545
546	eor	v16.16b,v4.16b,v0.16b
547	ext	v3.16b,v16.16b,v16.16b,#8
548
549	pmull	v0.1q,v20.1d,v3.1d		//H·(Xi+Ii)
550	eor	v16.16b,v16.16b,v3.16b
551	pmull2	v2.1q,v20.2d,v3.2d
552	pmull	v1.1q,v21.1d,v16.1d
553
// Common exit: reduce the unreduced product in v0/v1/v2 and store it as
// the new Xi.
554Ldone4x:
555	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
556	eor	v18.16b,v0.16b,v2.16b
557	eor	v1.16b,v1.16b,v17.16b
558	eor	v1.16b,v1.16b,v18.16b
559
560	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
561	ins	v2.d[0],v1.d[1]
562	ins	v1.d[1],v0.d[0]
563	eor	v0.16b,v1.16b,v18.16b
564
565	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
566	pmull	v0.1q,v0.1d,v19.1d
567	eor	v18.16b,v18.16b,v2.16b
568	eor	v0.16b,v0.16b,v18.16b
569	ext	v0.16b,v0.16b,v0.16b,#8
570
571#ifndef __ARMEB__
572	rev64	v0.16b,v0.16b
573#endif
574	st1	{v0.2d},[x0]		//write out Xi
575
576	ret
577
// NUL-terminated ASCII identification string:
// "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
578.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
579.align	2
580.align	2
581#endif  // __ARM_MAX_ARCH__>=7
582#endif  // __aarch64__
583#endif  // !OPENSSL_NO_ASM
584