• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
// Fallback so the __has_feature() test below also works with compilers that
// do not provide it.
4#if !defined(__has_feature)
5#define __has_feature(x) 0
6#endif
// When building with MemorySanitizer, define OPENSSL_NO_ASM, which compiles
// out everything inside the OPENSSL_NO_ASM guard that follows.
7#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
8#define OPENSSL_NO_ASM
9#endif
10
// The code is emitted only when assembly is enabled and the target is AArch64.
11#if !defined(OPENSSL_NO_ASM)
12#if defined(__aarch64__)
// Symbol-prefix header, included only when BORINGSSL_PREFIX is defined.
13#if defined(BORINGSSL_PREFIX)
14#include <boringssl_prefix_symbols_asm.h>
15#endif
16#include <openssl/arm_arch.h>
17
// NOTE(review): __ARM_MAX_ARCH__ and AARCH64_VALID_CALL_TARGET are not defined
// in this file; presumably they come from <openssl/arm_arch.h> -- confirm.
18#if __ARM_MAX_ARCH__>=7
19.text
// Lets the assembler accept the pmull/pmull2 instructions used below.
20.arch	armv8-a+crypto
// gcm_init_v8 -- build the GHASH key table from the hash key.
//
// Register usage, as read from the code below:
//   x0   out: table base; six 16-byte entries (96 bytes) are stored
//   x1   in:  16-byte hash key H
//   v19  reduction constant, 0xc2 in the top byte of each 64-bit lane
//
// Entries stored:
//   [0] twisted H
//   [1] Karatsuba pre-processed (lo^hi) values: H in d[0], H^2 in d[1]
//   [2] H^2
//   [3] H^3
//   [4] Karatsuba pre-processed (lo^hi) values: H^3 in d[0], H^4 in d[1]
//   [5] H^4
//
// Writes v0-v7 and v16-v22. x0 is left advanced by 48 bytes by the two
// post-indexed stores; the final store does not update it.
21.globl	gcm_init_v8
22.hidden	gcm_init_v8
23.type	gcm_init_v8,%function
24.align	4
25gcm_init_v8:
26	AARCH64_VALID_CALL_TARGET
27	ld1	{v17.2d},[x1]		//load input H
28	movi	v19.16b,#0xe1
29	shl	v19.2d,v19.2d,#57		//0xc2.0
30	ext	v3.16b,v17.16b,v17.16b,#8
31	ushr	v18.2d,v19.2d,#63
32	dup	v17.4s,v17.s[1]
33	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
34	ushr	v18.2d,v3.2d,#63
35	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
36	and	v18.16b,v18.16b,v16.16b
37	shl	v3.2d,v3.2d,#1
38	ext	v18.16b,v18.16b,v18.16b,#8
39	and	v16.16b,v16.16b,v17.16b
40	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
41	eor	v20.16b,v3.16b,v16.16b		//twisted H
42	st1	{v20.2d},[x0],#16		//store Htable[0]
43
44	//calculate H^2
	//v16 = H.lo^H.hi in both halves; it is reused further down when
	//packing Htable[1] and when multiplying H by H^2
45	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
46	pmull	v0.1q,v20.1d,v20.1d
47	eor	v16.16b,v16.16b,v20.16b
48	pmull2	v2.1q,v20.2d,v20.2d
49	pmull	v1.1q,v16.1d,v16.1d
50
51	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
52	eor	v18.16b,v0.16b,v2.16b
53	eor	v1.16b,v1.16b,v17.16b
54	eor	v1.16b,v1.16b,v18.16b
55	pmull	v18.1q,v0.1d,v19.1d		//1st phase
56
57	ins	v2.d[0],v1.d[1]
58	ins	v1.d[1],v0.d[0]
59	eor	v0.16b,v1.16b,v18.16b
60
61	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
62	pmull	v0.1q,v0.1d,v19.1d
63	eor	v18.16b,v18.16b,v2.16b
64	eor	v22.16b,v0.16b,v18.16b
	//v22 = H^2
65
66	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
67	eor	v17.16b,v17.16b,v22.16b
68	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
69	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
70	//calculate H^3 and H^4
	//two independent multiply-and-reduce chains are interleaved here:
	//H*H^2 uses v0/v1/v2/v18, H^2*H^2 uses v5/v6/v7/v4
71	pmull	v0.1q,v20.1d, v22.1d
72	pmull	v5.1q,v22.1d,v22.1d
73	pmull2	v2.1q,v20.2d, v22.2d
74	pmull2	v7.1q,v22.2d,v22.2d
75	pmull	v1.1q,v16.1d,v17.1d
76	pmull	v6.1q,v17.1d,v17.1d
77
78	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
79	ext	v17.16b,v5.16b,v7.16b,#8
80	eor	v18.16b,v0.16b,v2.16b
81	eor	v1.16b,v1.16b,v16.16b
82	eor	v4.16b,v5.16b,v7.16b
83	eor	v6.16b,v6.16b,v17.16b
84	eor	v1.16b,v1.16b,v18.16b
85	pmull	v18.1q,v0.1d,v19.1d		//1st phase
86	eor	v6.16b,v6.16b,v4.16b
87	pmull	v4.1q,v5.1d,v19.1d
88
89	ins	v2.d[0],v1.d[1]
90	ins	v7.d[0],v6.d[1]
91	ins	v1.d[1],v0.d[0]
92	ins	v6.d[1],v5.d[0]
93	eor	v0.16b,v1.16b,v18.16b
94	eor	v5.16b,v6.16b,v4.16b
95
96	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
97	ext	v4.16b,v5.16b,v5.16b,#8
98	pmull	v0.1q,v0.1d,v19.1d
99	pmull	v5.1q,v5.1d,v19.1d
100	eor	v18.16b,v18.16b,v2.16b
101	eor	v4.16b,v4.16b,v7.16b
	//from here on v20 holds H^3 (it no longer holds twisted H)
102	eor	v20.16b, v0.16b,v18.16b		//H^3
103	eor	v22.16b,v5.16b,v4.16b		//H^4
104
105	ext	v16.16b,v20.16b, v20.16b,#8		//Karatsuba pre-processing
106	ext	v17.16b,v22.16b,v22.16b,#8
107	eor	v16.16b,v16.16b,v20.16b
108	eor	v17.16b,v17.16b,v22.16b
109	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
110	st1	{v20.2d,v21.2d,v22.2d},[x0]		//store Htable[3..5]
111	ret
112.size	gcm_init_v8,.-gcm_init_v8
// gcm_gmult_v8 -- one GHASH multiplication: Xi = Xi * H, reduced, in place.
//
// Register usage, as read from the code below:
//   x0   in/out: Xi, 16 bytes, loaded at entry and overwritten at exit
//   x1   in: key table; the first 32 bytes are loaded (twisted H into v20,
//        the packed Karatsuba value into v21) -- presumably the table
//        written by gcm_init_v8
//   v19  reduction constant, 0xc2 in the top byte of each 64-bit lane
//
// Writes v0-v3 and v17-v21. Neither x0 nor x1 is modified.
113.globl	gcm_gmult_v8
114.hidden	gcm_gmult_v8
115.type	gcm_gmult_v8,%function
116.align	4
117gcm_gmult_v8:
118	AARCH64_VALID_CALL_TARGET
119	ld1	{v17.2d},[x0]		//load Xi
120	movi	v19.16b,#0xe1
121	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
122	shl	v19.2d,v19.2d,#57
	//byte-reverse each 64-bit half unless built big-endian
123#ifndef __ARMEB__
124	rev64	v17.16b,v17.16b
125#endif
126	ext	v3.16b,v17.16b,v17.16b,#8
127
128	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
129	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
130	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
131	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
132
133	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
134	eor	v18.16b,v0.16b,v2.16b
135	eor	v1.16b,v1.16b,v17.16b
136	eor	v1.16b,v1.16b,v18.16b
137	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
138
139	ins	v2.d[0],v1.d[1]
140	ins	v1.d[1],v0.d[0]
141	eor	v0.16b,v1.16b,v18.16b
142
143	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
144	pmull	v0.1q,v0.1d,v19.1d
145	eor	v18.16b,v18.16b,v2.16b
146	eor	v0.16b,v0.16b,v18.16b
147
	//undo the byte reversal and half swap applied on load
148#ifndef __ARMEB__
149	rev64	v0.16b,v0.16b
150#endif
151	ext	v0.16b,v0.16b,v0.16b,#8
152	st1	{v0.2d},[x0]		//write out Xi
153
154	ret
155.size	gcm_gmult_v8,.-gcm_gmult_v8
// gcm_ghash_v8 -- fold a run of 16-byte blocks into the GHASH state Xi.
//
// Register usage, as read from the code below:
//   x0   in/out: Xi, 16 bytes, loaded at entry and overwritten at exit
//   x1   in: key table; 48 bytes are loaded here (v20 = twisted H,
//        v21 = packed Karatsuba value, v22 = H^2); x1 is advanced by 32
//   x2   in: input pointer, advanced as blocks are consumed
//   x3   in: input length in bytes; used as a down-counter
//   x12  input post-increment, 16 or 0 (see the comment on its mov)
//   v19  reduction constant, 0xc2 in the top byte of each 64-bit lane
//
// Lengths of 64 bytes or more (unsigned compare) are handed straight to
// .Lgcm_ghash_v8_4x. Shorter inputs are handled here two blocks per
// iteration, with a single-block tail at .Lodd_tail_v8.
//
// NOTE(review): the first block is loaded and processed unconditionally, so
// x3 is presumably always a non-zero multiple of 16 -- confirm with callers.
//
// Writes v0-v7, v16-v22, x12 and the condition flags on this path.
156.globl	gcm_ghash_v8
157.hidden	gcm_ghash_v8
158.type	gcm_ghash_v8,%function
159.align	4
160gcm_ghash_v8:
161	AARCH64_VALID_CALL_TARGET
	//64 bytes or more: use the four-blocks-per-iteration code instead
162	cmp	x3,#64
163	b.hs	.Lgcm_ghash_v8_4x
164	ld1	{v0.2d},[x0]		//load [rotated] Xi
165						//"[rotated]" means that
166						//loaded value would have
167						//to be rotated in order to
168						//make it appear as in
169						//algorithm specification
	//the flags set by this subs stay live until the b.lo below: the
	//instructions in between do not write the flags
170	subs	x3,x3,#32		//see if x3 is 32 or larger
171	mov	x12,#16		//x12 is used as post-
172						//increment for input pointer;
173						//as loop is modulo-scheduled
174						//x12 is zeroed just in time
175						//to preclude overstepping
176						//inp[len], which means that
177						//last block[s] are actually
178						//loaded twice, but last
179						//copy is not processed
180	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
181	movi	v19.16b,#0xe1
182	ld1	{v22.2d},[x1]
183	csel	x12,xzr,x12,eq			//is it time to zero x12?
184	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
185	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
186	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
187#ifndef __ARMEB__
188	rev64	v16.16b,v16.16b
189	rev64	v0.16b,v0.16b
190#endif
191	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
192	b.lo	.Lodd_tail_v8		//x3 was less than 32
193	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
194#ifndef __ARMEB__
195	rev64	v17.16b,v17.16b
196#endif
197	ext	v7.16b,v17.16b,v17.16b,#8
198	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
	//start the H·I[1] product ahead of the loop; the loop body finishes
	//it (v4 = lo, v6 = hi, v17 = Karatsuba input for the middle term)
199	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
200	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
201	pmull2	v6.1q,v20.2d,v7.2d
202	b	.Loop_mod2x_v8
203
204.align	4
	//two blocks per iteration: H^2·(Xi^I[i]) + H·I[i+1], then reduce.
	//The loads and the H·I[i+1] partial products for the next iteration
	//are issued inside the current one.
205.Loop_mod2x_v8:
206	ext	v18.16b,v3.16b,v3.16b,#8
	//the flags from this subs drive both csel instructions below and
	//the b.hs at the bottom of the loop
207	subs	x3,x3,#32		//is there more data?
208	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
209	csel	x12,xzr,x12,lo			//is it time to zero x12?
210
211	pmull	v5.1q,v21.1d,v17.1d
212	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
213	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
214	eor	v0.16b,v0.16b,v4.16b		//accumulate
215	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
216	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]
217
218	eor	v2.16b,v2.16b,v6.16b
219	csel	x12,xzr,x12,eq			//is it time to zero x12?
220	eor	v1.16b,v1.16b,v5.16b
221
222	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
223	eor	v18.16b,v0.16b,v2.16b
224	eor	v1.16b,v1.16b,v17.16b
225	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
226#ifndef __ARMEB__
227	rev64	v16.16b,v16.16b
228#endif
229	eor	v1.16b,v1.16b,v18.16b
230	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
231
232#ifndef __ARMEB__
233	rev64	v17.16b,v17.16b
234#endif
235	ins	v2.d[0],v1.d[1]
236	ins	v1.d[1],v0.d[0]
237	ext	v7.16b,v17.16b,v17.16b,#8
238	ext	v3.16b,v16.16b,v16.16b,#8
239	eor	v0.16b,v1.16b,v18.16b
240	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
241	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early
242
243	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
244	pmull	v0.1q,v0.1d,v19.1d
245	eor	v3.16b,v3.16b,v18.16b
246	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
247	eor	v3.16b,v3.16b,v0.16b
248	pmull2	v6.1q,v20.2d,v7.2d
249	b.hs	.Loop_mod2x_v8		//there was at least 32 more bytes
250
	//loop exit: the reduced Xi was folded into v3 early, so rebuild v0
	//(Xi) and v3 (next block) in the form .Lodd_tail_v8 expects
251	eor	v2.16b,v2.16b,v18.16b
252	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
253	adds	x3,x3,#32		//re-construct x3
254	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
255	b.eq	.Ldone_v8		//is x3 zero?
	//single remaining block: Xi = (Xi ^ inp)·H
256.Lodd_tail_v8:
257	ext	v18.16b,v0.16b,v0.16b,#8
258	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
259	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi
260
261	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
262	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
263	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
264	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
265
266	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
267	eor	v18.16b,v0.16b,v2.16b
268	eor	v1.16b,v1.16b,v17.16b
269	eor	v1.16b,v1.16b,v18.16b
270	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
271
272	ins	v2.d[0],v1.d[1]
273	ins	v1.d[1],v0.d[0]
274	eor	v0.16b,v1.16b,v18.16b
275
276	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
277	pmull	v0.1q,v0.1d,v19.1d
278	eor	v18.16b,v18.16b,v2.16b
279	eor	v0.16b,v0.16b,v18.16b
280
	//undo the byte reversal and rotation applied on load, then store Xi
281.Ldone_v8:
282#ifndef __ARMEB__
283	rev64	v0.16b,v0.16b
284#endif
285	ext	v0.16b,v0.16b,v0.16b,#8
286	st1	{v0.2d},[x0]		//write out Xi
287
288	ret
289.size	gcm_ghash_v8,.-gcm_ghash_v8
// gcm_ghash_v8_4x -- GHASH bulk path, four 16-byte blocks per iteration.
//
// No .globl directive is given for this symbol and it has no
// AARCH64_VALID_CALL_TARGET; in this file it is reached only by the branch
// to .Lgcm_ghash_v8_4x at the top of gcm_ghash_v8, taken when x3 >= 64.
// Registers at entry are therefore exactly as gcm_ghash_v8 received them.
//
// Register usage, as read from the code below:
//   x0       in/out: Xi, 16 bytes, loaded at entry and overwritten at exit
//   x1       in: key table; 96 bytes are loaded (v20-v22 = H, packed, H^2;
//            v26-v28 = H^3, packed, H^4); x1 is advanced by 48
//   x2       in: input pointer, advanced by 64 per group of four blocks
//   x3       in: input length in bytes; used as a down-counter
//   v19      reduction constant, 0xc2 in the top byte of each 64-bit lane
//   v29/v30/v31  low / middle / high partial-product accumulators for the
//            three trailing blocks of the current group
//
// Writes v0-v7, v16-v31 and the condition flags.
290.type	gcm_ghash_v8_4x,%function
291.align	4
292gcm_ghash_v8_4x:
293.Lgcm_ghash_v8_4x:
294	ld1	{v0.2d},[x0]		//load [rotated] Xi
295	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
296	movi	v19.16b,#0xe1
297	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
298	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
299
	//first group of four input blocks: v4 = I[i] .. v7 = I[i+3]
300	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
301#ifndef __ARMEB__
302	rev64	v0.16b,v0.16b
303	rev64	v5.16b,v5.16b
304	rev64	v6.16b,v6.16b
305	rev64	v7.16b,v7.16b
306	rev64	v4.16b,v4.16b
307#endif
308	ext	v25.16b,v7.16b,v7.16b,#8
309	ext	v24.16b,v6.16b,v6.16b,#8
310	ext	v23.16b,v5.16b,v5.16b,#8
311
	//products for the three trailing blocks do not depend on Xi, so they
	//are computed first and summed into v29 (lo), v30 (mid), v31 (hi)
312	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
313	eor	v7.16b,v7.16b,v25.16b
314	pmull2	v31.1q,v20.2d,v25.2d
315	pmull	v30.1q,v21.1d,v7.1d
316
317	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
318	eor	v6.16b,v6.16b,v24.16b
319	pmull2	v24.1q,v22.2d,v24.2d
320	pmull2	v6.1q,v21.2d,v6.2d
321
322	eor	v29.16b,v29.16b,v16.16b
323	eor	v31.16b,v31.16b,v24.16b
324	eor	v30.16b,v30.16b,v6.16b
325
326	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
327	eor	v5.16b,v5.16b,v23.16b
328	pmull2	v23.1q,v26.2d,v23.2d
329	pmull	v5.1q,v27.1d,v5.1d
330
331	eor	v29.16b,v29.16b,v7.16b
332	eor	v31.16b,v31.16b,v23.16b
333	eor	v30.16b,v30.16b,v5.16b
334
	//64 bytes are already loaded; enter the loop only if another full
	//64-byte group exists, i.e. the length was at least 128
335	subs	x3,x3,#128
336	b.lo	.Ltail4x
337
338	b	.Loop4x
339
340.align	4
	//each iteration finishes the pending group -- H^4·(Xi^I[i]) plus the
	//v29/v30/v31 sums, then reduction -- while loading the next group and
	//computing that group's Xi-independent products
341.Loop4x:
342	eor	v16.16b,v4.16b,v0.16b
343	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
344	ext	v3.16b,v16.16b,v16.16b,#8
345#ifndef __ARMEB__
346	rev64	v5.16b,v5.16b
347	rev64	v6.16b,v6.16b
348	rev64	v7.16b,v7.16b
349	rev64	v4.16b,v4.16b
350#endif
351
352	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
353	eor	v16.16b,v16.16b,v3.16b
354	pmull2	v2.1q,v28.2d,v3.2d
355	ext	v25.16b,v7.16b,v7.16b,#8
356	pmull2	v1.1q,v27.2d,v16.2d
357
358	eor	v0.16b,v0.16b,v29.16b
359	eor	v2.16b,v2.16b,v31.16b
360	ext	v24.16b,v6.16b,v6.16b,#8
361	eor	v1.16b,v1.16b,v30.16b
362	ext	v23.16b,v5.16b,v5.16b,#8
363
364	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
365	eor	v18.16b,v0.16b,v2.16b
366	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
367	eor	v7.16b,v7.16b,v25.16b
368	eor	v1.16b,v1.16b,v17.16b
369	pmull2	v31.1q,v20.2d,v25.2d
370	eor	v1.16b,v1.16b,v18.16b
371	pmull	v30.1q,v21.1d,v7.1d
372
373	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
374	ins	v2.d[0],v1.d[1]
375	ins	v1.d[1],v0.d[0]
376	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
377	eor	v6.16b,v6.16b,v24.16b
378	pmull2	v24.1q,v22.2d,v24.2d
379	eor	v0.16b,v1.16b,v18.16b
380	pmull2	v6.1q,v21.2d,v6.2d
381
382	eor	v29.16b,v29.16b,v16.16b
383	eor	v31.16b,v31.16b,v24.16b
384	eor	v30.16b,v30.16b,v6.16b
385
386	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
387	pmull	v0.1q,v0.1d,v19.1d
388	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
389	eor	v5.16b,v5.16b,v23.16b
390	eor	v18.16b,v18.16b,v2.16b
391	pmull2	v23.1q,v26.2d,v23.2d
392	pmull	v5.1q,v27.1d,v5.1d
393
394	eor	v0.16b,v0.16b,v18.16b
395	eor	v29.16b,v29.16b,v7.16b
396	eor	v31.16b,v31.16b,v23.16b
397	ext	v0.16b,v0.16b,v0.16b,#8
398	eor	v30.16b,v30.16b,v5.16b
399
400	subs	x3,x3,#64
401	b.hs	.Loop4x
402
	//finish the multiplication for the last fully loaded group; the
	//result is left unreduced in v0 (lo), v1 (mid), v2 (hi)
403.Ltail4x:
404	eor	v16.16b,v4.16b,v0.16b
405	ext	v3.16b,v16.16b,v16.16b,#8
406
407	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
408	eor	v16.16b,v16.16b,v3.16b
409	pmull2	v2.1q,v28.2d,v3.2d
410	pmull2	v1.1q,v27.2d,v16.2d
411
412	eor	v0.16b,v0.16b,v29.16b
413	eor	v2.16b,v2.16b,v31.16b
414	eor	v1.16b,v1.16b,v30.16b
415
	//x3 becomes the number of leftover bytes; the branches below
	//distinguish 0, below 32, exactly 32, and above 32
416	adds	x3,x3,#64
417	b.eq	.Ldone4x
418
419	cmp	x3,#32
420	b.lo	.Lone
421	b.eq	.Ltwo
	//three leftover blocks: reduce the pending product, then form
	//H^3·(Xi^I[i]) + H^2·I[i+1] + H·I[i+2]
422.Lthree:
423	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
424	eor	v18.16b,v0.16b,v2.16b
425	eor	v1.16b,v1.16b,v17.16b
426	ld1	{v4.2d,v5.2d,v6.2d},[x2]
427	eor	v1.16b,v1.16b,v18.16b
428#ifndef	__ARMEB__
429	rev64	v5.16b,v5.16b
430	rev64	v6.16b,v6.16b
431	rev64	v4.16b,v4.16b
432#endif
433
434	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
435	ins	v2.d[0],v1.d[1]
436	ins	v1.d[1],v0.d[0]
437	ext	v24.16b,v6.16b,v6.16b,#8
438	ext	v23.16b,v5.16b,v5.16b,#8
439	eor	v0.16b,v1.16b,v18.16b
440
441	pmull	v29.1q,v20.1d,v24.1d		//H·Ii+2
442	eor	v6.16b,v6.16b,v24.16b
443
444	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
445	pmull	v0.1q,v0.1d,v19.1d
446	eor	v18.16b,v18.16b,v2.16b
447	pmull2	v31.1q,v20.2d,v24.2d
448	pmull	v30.1q,v21.1d,v6.1d
449	eor	v0.16b,v0.16b,v18.16b
450	pmull	v7.1q,v22.1d,v23.1d		//H^2·Ii+1
451	eor	v5.16b,v5.16b,v23.16b
452	ext	v0.16b,v0.16b,v0.16b,#8
453
454	pmull2	v23.1q,v22.2d,v23.2d
455	eor	v16.16b,v4.16b,v0.16b
456	pmull2	v5.1q,v21.2d,v5.2d
457	ext	v3.16b,v16.16b,v16.16b,#8
458
459	eor	v29.16b,v29.16b,v7.16b
460	eor	v31.16b,v31.16b,v23.16b
461	eor	v30.16b,v30.16b,v5.16b
462
463	pmull	v0.1q,v26.1d,v3.1d		//H^3·(Xi+Ii)
464	eor	v16.16b,v16.16b,v3.16b
465	pmull2	v2.1q,v26.2d,v3.2d
466	pmull	v1.1q,v27.1d,v16.1d
467
468	eor	v0.16b,v0.16b,v29.16b
469	eor	v2.16b,v2.16b,v31.16b
470	eor	v1.16b,v1.16b,v30.16b
471	b	.Ldone4x
472
473.align	4
	//two leftover blocks: reduce the pending product, then form
	//H^2·(Xi^I[i]) + H·I[i+1]
474.Ltwo:
475	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
476	eor	v18.16b,v0.16b,v2.16b
477	eor	v1.16b,v1.16b,v17.16b
478	ld1	{v4.2d,v5.2d},[x2]
479	eor	v1.16b,v1.16b,v18.16b
480#ifndef	__ARMEB__
481	rev64	v5.16b,v5.16b
482	rev64	v4.16b,v4.16b
483#endif
484
485	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
486	ins	v2.d[0],v1.d[1]
487	ins	v1.d[1],v0.d[0]
488	ext	v23.16b,v5.16b,v5.16b,#8
489	eor	v0.16b,v1.16b,v18.16b
490
491	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
492	pmull	v0.1q,v0.1d,v19.1d
493	eor	v18.16b,v18.16b,v2.16b
494	eor	v0.16b,v0.16b,v18.16b
495	ext	v0.16b,v0.16b,v0.16b,#8
496
497	pmull	v29.1q,v20.1d,v23.1d		//H·Ii+1
498	eor	v5.16b,v5.16b,v23.16b
499
500	eor	v16.16b,v4.16b,v0.16b
501	ext	v3.16b,v16.16b,v16.16b,#8
502
503	pmull2	v31.1q,v20.2d,v23.2d
504	pmull	v30.1q,v21.1d,v5.1d
505
506	pmull	v0.1q,v22.1d,v3.1d		//H^2·(Xi+Ii)
507	eor	v16.16b,v16.16b,v3.16b
508	pmull2	v2.1q,v22.2d,v3.2d
509	pmull2	v1.1q,v21.2d,v16.2d
510
511	eor	v0.16b,v0.16b,v29.16b
512	eor	v2.16b,v2.16b,v31.16b
513	eor	v1.16b,v1.16b,v30.16b
514	b	.Ldone4x
515
516.align	4
	//one leftover block: reduce the pending product, then form
	//H·(Xi^I[i]) and fall through into the final reduction
517.Lone:
518	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
519	eor	v18.16b,v0.16b,v2.16b
520	eor	v1.16b,v1.16b,v17.16b
521	ld1	{v4.2d},[x2]
522	eor	v1.16b,v1.16b,v18.16b
523#ifndef	__ARMEB__
524	rev64	v4.16b,v4.16b
525#endif
526
527	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
528	ins	v2.d[0],v1.d[1]
529	ins	v1.d[1],v0.d[0]
530	eor	v0.16b,v1.16b,v18.16b
531
532	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
533	pmull	v0.1q,v0.1d,v19.1d
534	eor	v18.16b,v18.16b,v2.16b
535	eor	v0.16b,v0.16b,v18.16b
536	ext	v0.16b,v0.16b,v0.16b,#8
537
538	eor	v16.16b,v4.16b,v0.16b
539	ext	v3.16b,v16.16b,v16.16b,#8
540
541	pmull	v0.1q,v20.1d,v3.1d
542	eor	v16.16b,v16.16b,v3.16b
543	pmull2	v2.1q,v20.2d,v3.2d
544	pmull	v1.1q,v21.1d,v16.1d
545
	//common exit: final reduction of the unreduced product in v0/v1/v2,
	//then undo the load-time rotation and byte reversal and store Xi
546.Ldone4x:
547	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
548	eor	v18.16b,v0.16b,v2.16b
549	eor	v1.16b,v1.16b,v17.16b
550	eor	v1.16b,v1.16b,v18.16b
551
552	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
553	ins	v2.d[0],v1.d[1]
554	ins	v1.d[1],v0.d[0]
555	eor	v0.16b,v1.16b,v18.16b
556
557	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
558	pmull	v0.1q,v0.1d,v19.1d
559	eor	v18.16b,v18.16b,v2.16b
560	eor	v0.16b,v0.16b,v18.16b
561	ext	v0.16b,v0.16b,v0.16b,#8
562
563#ifndef __ARMEB__
564	rev64	v0.16b,v0.16b
565#endif
566	st1	{v0.2d},[x0]		//write out Xi
567
568	ret
569.size	gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
// NUL-terminated ASCII identification string:
// "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
570.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// The same alignment directive is emitted twice; the second one has no effect.
571.align	2
572.align	2
573#endif
574#endif
575#endif  // !OPENSSL_NO_ASM
// Placed after every #endif, so this section is emitted even when all of the
// code above is compiled out.
576.section	.note.GNU-stack,"",%progbits
577