• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#include "arm_arch.h"
2
3#if __ARM_MAX_ARCH__>=7
4
5.text
6.globl	_gcm_init_v8
7
//----------------------------------------------------------------------
// _gcm_init_v8: build the table of hash-key powers that the GHASH
// routines in this file read.
// In:   x0 = destination table, six 16-byte entries are written
//       x1 = pointer to the 16-byte hash key H
// Out:  entry 0 = twisted H          entry 3 = H^3
//       entry 1 = packed lo^hi terms entry 4 = packed lo^hi terms
//                 of H and H^2                 of H^3 and H^4
//       entry 2 = H^2                entry 5 = H^4
// Clobbers: x0 (advanced by 48), v0-v7, v16-v22. No stack is used.
// NOTE(review): presumably matches the C prototype
//       void gcm_init_v8(u128 Htable[16], const u64 H[2])
//       - confirm against the header used by the callers.
//----------------------------------------------------------------------
8.align	4
9_gcm_init_v8:
10	ld1	{v17.2d},[x1]		//load input H
// v19 ends up with 0xc2 in the top byte of each 64-bit lane and zeros
// below it. It is the multiplier used by every reduction phase below.
11	movi	v19.16b,#0xe1
12	shl	v19.2d,v19.2d,#57		//0xc2.0
// The next instructions are interleaved but form one step, the "twist":
// v3 (the key with its 64-bit halves swapped) is shifted left by one bit
// as a 128-bit value, with v18 carrying the top bit of the low lane into
// the high lane. v17 becomes an all-ones or all-zeros mask taken from
// the top bit of 32-bit lane 1 of the loaded key, and decides whether
// the constant held in v16 is xored into the shifted value.
13	ext	v3.16b,v17.16b,v17.16b,#8
14	ushr	v18.2d,v19.2d,#63
15	dup	v17.4s,v17.s[1]
16	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
17	ushr	v18.2d,v3.2d,#63
18	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
19	and	v18.16b,v18.16b,v16.16b
20	shl	v3.2d,v3.2d,#1
21	ext	v18.16b,v18.16b,v18.16b,#8
22	and	v16.16b,v16.16b,v17.16b
23	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
24	eor	v20.16b,v3.16b,v16.16b		//twisted H
25	st1	{v20.2d},[x0],#16		//store Htable[0]
26
27	//calculate H^2
// Square the twisted key with three 64x64 carry-less multiplies:
// v0 = lo·lo, v2 = hi·hi, v1 = (lo^hi)·(lo^hi). v16 keeps lo^hi of H
// for the packed table entry stored further down.
28	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
29	pmull	v0.1q,v20.1d,v20.1d
30	eor	v16.16b,v16.16b,v20.16b
31	pmull2	v2.1q,v20.2d,v20.2d
32	pmull	v1.1q,v16.1d,v16.1d
33
// Fold the three partial products together, then reduce the result in
// two pmull steps against the constant in v19.
34	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
35	eor	v18.16b,v0.16b,v2.16b
36	eor	v1.16b,v1.16b,v17.16b
37	eor	v1.16b,v1.16b,v18.16b
38	pmull	v18.1q,v0.1d,v19.1d		//1st phase
39
40	ins	v2.d[0],v1.d[1]
41	ins	v1.d[1],v0.d[0]
42	eor	v0.16b,v1.16b,v18.16b
43
44	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
45	pmull	v0.1q,v0.1d,v19.1d
46	eor	v18.16b,v18.16b,v2.16b
47	eor	v22.16b,v0.16b,v18.16b
48
// v22 = H^2. v17 = lo^hi of H^2. v21 takes one 64-bit copy of each
// lo^hi term: low lane from H (v16), high lane from H^2 (v17).
49	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
50	eor	v17.16b,v17.16b,v22.16b
51	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
52	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
53	//calculate H^3 and H^4
// Two independent multiply-and-reduce chains are interleaved here:
// v0/v1/v2 compute H·H^2 and v5/v6/v7 compute H^2·H^2.
54	pmull	v0.1q,v20.1d, v22.1d
55	pmull	v5.1q,v22.1d,v22.1d
56	pmull2	v2.1q,v20.2d, v22.2d
57	pmull2	v7.1q,v22.2d,v22.2d
58	pmull	v1.1q,v16.1d,v17.1d
59	pmull	v6.1q,v17.1d,v17.1d
60
61	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
62	ext	v17.16b,v5.16b,v7.16b,#8
63	eor	v18.16b,v0.16b,v2.16b
64	eor	v1.16b,v1.16b,v16.16b
65	eor	v4.16b,v5.16b,v7.16b
66	eor	v6.16b,v6.16b,v17.16b
67	eor	v1.16b,v1.16b,v18.16b
68	pmull	v18.1q,v0.1d,v19.1d		//1st phase
69	eor	v6.16b,v6.16b,v4.16b
70	pmull	v4.1q,v5.1d,v19.1d
71
72	ins	v2.d[0],v1.d[1]
73	ins	v7.d[0],v6.d[1]
74	ins	v1.d[1],v0.d[0]
75	ins	v6.d[1],v5.d[0]
76	eor	v0.16b,v1.16b,v18.16b
77	eor	v5.16b,v6.16b,v4.16b
78
79	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
80	ext	v4.16b,v5.16b,v5.16b,#8
81	pmull	v0.1q,v0.1d,v19.1d
82	pmull	v5.1q,v5.1d,v19.1d
83	eor	v18.16b,v18.16b,v2.16b
84	eor	v4.16b,v4.16b,v7.16b
85	eor	v20.16b, v0.16b,v18.16b		//H^3
86	eor	v22.16b,v5.16b,v4.16b		//H^4
87
// Same packing as for entries 1..2, now for H^3 (v20) and H^4 (v22).
88	ext	v16.16b,v20.16b, v20.16b,#8		//Karatsuba pre-processing
89	ext	v17.16b,v22.16b,v22.16b,#8
90	eor	v16.16b,v16.16b,v20.16b
91	eor	v17.16b,v17.16b,v22.16b
92	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
93	st1	{v20.2d,v21.2d,v22.2d},[x0]		//store Htable[3..5]
94	ret
95
96.globl	_gcm_gmult_v8
97
//----------------------------------------------------------------------
// _gcm_gmult_v8: multiply the GHASH accumulator by the hash key once,
// Xi = Xi·H, and write the reduced product back in place.
// In:   x0 = Xi, 16 bytes, read and then overwritten with the result
//       x1 = key table, of which the first two entries are read
//            (twisted H and the packed lo^hi Karatsuba term)
// Clobbers: v0-v3, v17-v21. No general register is modified and no
//       stack is used.
// NOTE(review): presumably matches the C prototype
//       void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16])
//       - confirm against the header used by the callers.
//----------------------------------------------------------------------
98.align	4
99_gcm_gmult_v8:
100	ld1	{v17.2d},[x0]		//load Xi
101	movi	v19.16b,#0xe1
102	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
// v19 = 0xc2 in the top byte of each 64-bit lane, the reduction constant
103	shl	v19.2d,v19.2d,#57
// On little-endian builds the bytes inside each 64-bit half are
// reversed. The ext that follows swaps the two halves on every build.
104#ifndef __AARCH64EB__
105	rev64	v17.16b,v17.16b
106#endif
107	ext	v3.16b,v17.16b,v17.16b,#8
108
109	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
110	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
111	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
112	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
113
// v0 = low product, v2 = high product, v1 = middle term. Combine them,
// then fold the 256-bit result back to 128 bits in two pmull steps
// against v19.
114	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
115	eor	v18.16b,v0.16b,v2.16b
116	eor	v1.16b,v1.16b,v17.16b
117	eor	v1.16b,v1.16b,v18.16b
118	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
119
120	ins	v2.d[0],v1.d[1]
121	ins	v1.d[1],v0.d[0]
122	eor	v0.16b,v1.16b,v18.16b
123
124	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
125	pmull	v0.1q,v0.1d,v19.1d
126	eor	v18.16b,v18.16b,v2.16b
127	eor	v0.16b,v0.16b,v18.16b
128
// Undo the load-time byte reversal and half swap before storing.
129#ifndef __AARCH64EB__
130	rev64	v0.16b,v0.16b
131#endif
132	ext	v0.16b,v0.16b,v0.16b,#8
133	st1	{v0.2d},[x0]		//write out Xi
134
135	ret
136
137.globl	_gcm_ghash_v8
138
//----------------------------------------------------------------------
// _gcm_ghash_v8: fold a run of 16-byte input blocks into the GHASH
// accumulator Xi.
// In:   x0 = Xi, 16 bytes, read on entry and overwritten with the result
//       x1 = key table (twisted H, packed lo^hi term and H^2 are read
//            on this path)
//       x2 = input pointer
//       x3 = input length in bytes
// Lengths of 64 bytes or more are handed to the 4x routine further down
// in this file. Shorter inputs go through a loop that consumes 32 bytes
// per iteration, with Lodd_tail_v8 handling a final single block.
// Clobbers: x1, x2, x3, x12, flags, v0-v7, v16-v22. No stack is used.
// NOTE(review): a length of zero is never tested for. Any x3 below 32
//       runs Lodd_tail_v8 exactly once, so callers presumably always
//       pass a non-zero multiple of 16 - confirm.
// NOTE(review): presumably matches the C prototype
//       void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16],
//                         const u8 *inp, size_t len)
//       - confirm against the header used by the callers.
//----------------------------------------------------------------------
139.align	4
140_gcm_ghash_v8:
141	cmp	x3,#64
142	b.hs	Lgcm_ghash_v8_4x
143	ld1	{v0.2d},[x0]		//load [rotated] Xi
144						//"[rotated]" means that
145						//loaded value would have
146						//to be rotated in order to
147						//make it appear as in
148						//algorithm specification
149	subs	x3,x3,#32		//see if x3 is 32 or larger
150	mov	x12,#16		//x12 is used as post-
151						//increment for input pointer;
152						//as loop is modulo-scheduled
153						//x12 is zeroed just in time
154						//to preclude overstepping
155						//inp[len], which means that
156						//last block[s] are actually
157						//loaded twice, but last
158						//copy is not processed
159	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
160	movi	v19.16b,#0xe1
161	ld1	{v22.2d},[x1]
162	csel	x12,xzr,x12,eq			//is it time to zero x12?
163	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
164	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
165	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
166#ifndef __AARCH64EB__
167	rev64	v16.16b,v16.16b
168	rev64	v0.16b,v0.16b
169#endif
170	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
// The flags tested here are still those of the subs above: none of the
// instructions in between writes the condition flags.
171	b.lo	Lodd_tail_v8		//x3 was less than 32
172	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
173#ifndef __AARCH64EB__
174	rev64	v17.16b,v17.16b
175#endif
// Loop prologue: start the H·I[1] product (v4 = low, v6 = high, v17 =
// lo^hi of I[1] for the middle term) before entering the loop, which
// expects these to be in flight already.
176	ext	v7.16b,v17.16b,v17.16b,#8
177	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
178	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
179	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
180	pmull2	v6.1q,v20.2d,v7.2d
181	b	Loop_mod2x_v8
182
// Each iteration finishes one pair of blocks, H^2·(Xi^I[i]) + H·I[i+1],
// reduces it, and meanwhile loads the next pair and starts its H·I[i+3]
// product. x12 is the post-increment for the loads and is forced to
// zero near the end of the input.
183.align	4
184Loop_mod2x_v8:
185	ext	v18.16b,v3.16b,v3.16b,#8
186	subs	x3,x3,#32		//is there more data?
187	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
188	csel	x12,xzr,x12,lo			//is it time to zero x12?
189
190	pmull	v5.1q,v21.1d,v17.1d
191	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
192	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
193	eor	v0.16b,v0.16b,v4.16b		//accumulate
194	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
195	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]
196
197	eor	v2.16b,v2.16b,v6.16b
198	csel	x12,xzr,x12,eq			//is it time to zero x12?
199	eor	v1.16b,v1.16b,v5.16b
200
201	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
202	eor	v18.16b,v0.16b,v2.16b
203	eor	v1.16b,v1.16b,v17.16b
204	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
205#ifndef __AARCH64EB__
206	rev64	v16.16b,v16.16b
207#endif
208	eor	v1.16b,v1.16b,v18.16b
209	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
210
211#ifndef __AARCH64EB__
212	rev64	v17.16b,v17.16b
213#endif
214	ins	v2.d[0],v1.d[1]
215	ins	v1.d[1],v0.d[0]
216	ext	v7.16b,v17.16b,v17.16b,#8
217	ext	v3.16b,v16.16b,v16.16b,#8
218	eor	v0.16b,v1.16b,v18.16b
219	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
220	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early
221
222	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
223	pmull	v0.1q,v0.1d,v19.1d
224	eor	v3.16b,v3.16b,v18.16b
225	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
226	eor	v3.16b,v3.16b,v0.16b
227	pmull2	v6.1q,v20.2d,v7.2d
// The flags tested here come from the subs at the top of the loop:
// nothing in between writes the condition flags.
228	b.hs	Loop_mod2x_v8		//there was at least 32 more bytes
229
// Loop exit: the reduced value was folded straight into v3 for the next
// iteration, so rebuild v0 (the running Xi) and v3 (the plain rotated
// block) and restore the byte count before deciding about the tail.
230	eor	v2.16b,v2.16b,v18.16b
231	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
232	adds	x3,x3,#32		//re-construct x3
233	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
234	b.eq	Ldone_v8		//is x3 zero?
// One block left: Xi = (Xi ^ I)·H, using the same multiply and reduce
// sequence as _gcm_gmult_v8.
235Lodd_tail_v8:
236	ext	v18.16b,v0.16b,v0.16b,#8
237	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
238	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi
239
240	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
241	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
242	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
243	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
244
245	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
246	eor	v18.16b,v0.16b,v2.16b
247	eor	v1.16b,v1.16b,v17.16b
248	eor	v1.16b,v1.16b,v18.16b
249	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
250
251	ins	v2.d[0],v1.d[1]
252	ins	v1.d[1],v0.d[0]
253	eor	v0.16b,v1.16b,v18.16b
254
255	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
256	pmull	v0.1q,v0.1d,v19.1d
257	eor	v18.16b,v18.16b,v2.16b
258	eor	v0.16b,v0.16b,v18.16b
259
// Undo the load-time byte reversal and half swap, then store Xi.
260Ldone_v8:
261#ifndef __AARCH64EB__
262	rev64	v0.16b,v0.16b
263#endif
264	ext	v0.16b,v0.16b,v0.16b,#8
265	st1	{v0.2d},[x0]		//write out Xi
266
267	ret
268
269
//----------------------------------------------------------------------
// gcm_ghash_v8_4x: GHASH over four blocks (64 bytes) per iteration.
// Reached by the branch at the top of _gcm_ghash_v8, taken when x3 is
// 64 or more. The label is not declared .globl in this file.
// In:   x0 = Xi, 16 bytes, read on entry and overwritten with the result
//       x1 = key table, all six entries are read:
//            v20 = twisted H, v21 = packed lo^hi of H and H^2, v22 = H^2
//            v26 = H^3, v27 = packed lo^hi of H^3 and H^4, v28 = H^4
//       x2 = input pointer
//       x3 = input length in bytes
// Between groups the partial products of blocks 1..3 are kept in
// v29 (low), v30 (middle Karatsuba term) and v31 (high). Block 0 of the
// group waits in v4 until it can be xored with the running Xi.
// After the last full group, Lone / Ltwo / Lthree absorb one, two or
// three remaining blocks and Ldone4x performs the final reduction.
// Clobbers: x1, x2, x3, flags, v0-v7, v16-v31. No stack is used.
//----------------------------------------------------------------------
270.align	4
271gcm_ghash_v8_4x:
272Lgcm_ghash_v8_4x:
273	ld1	{v0.2d},[x0]		//load [rotated] Xi
274	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
275	movi	v19.16b,#0xe1
276	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
277	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
278
279	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
280#ifndef __AARCH64EB__
281	rev64	v0.16b,v0.16b
282	rev64	v5.16b,v5.16b
283	rev64	v6.16b,v6.16b
284	rev64	v7.16b,v7.16b
285	rev64	v4.16b,v4.16b
286#endif
287	ext	v25.16b,v7.16b,v7.16b,#8
288	ext	v24.16b,v6.16b,v6.16b,#8
289	ext	v23.16b,v5.16b,v5.16b,#8
290
// Products for blocks 3, 2 and 1 of the first group, accumulated into
// v29 / v30 / v31.
291	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
292	eor	v7.16b,v7.16b,v25.16b
293	pmull2	v31.1q,v20.2d,v25.2d
294	pmull	v30.1q,v21.1d,v7.1d
295
296	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
297	eor	v6.16b,v6.16b,v24.16b
298	pmull2	v24.1q,v22.2d,v24.2d
299	pmull2	v6.1q,v21.2d,v6.2d
300
301	eor	v29.16b,v29.16b,v16.16b
302	eor	v31.16b,v31.16b,v24.16b
303	eor	v30.16b,v30.16b,v6.16b
304
305	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
306	eor	v5.16b,v5.16b,v23.16b
307	pmull2	v23.1q,v26.2d,v23.2d
308	pmull	v5.1q,v27.1d,v5.1d
309
310	eor	v29.16b,v29.16b,v7.16b
311	eor	v31.16b,v31.16b,v23.16b
312	eor	v30.16b,v30.16b,v5.16b
313
// 64 bytes have been loaded so far. Subtracting 128 borrows (b.lo) when
// x3 was below 128, i.e. there is no second full group of four to load.
314	subs	x3,x3,#128
315	b.lo	Ltail4x
316
317	b	Loop4x
318
// Loop body: complete the current group with H^4·(Xi^I[0]), add the
// pending v29/v30/v31, reduce to the new Xi in v0, and in the gaps load
// the next 64 bytes and build their v29/v30/v31.
319.align	4
320Loop4x:
321	eor	v16.16b,v4.16b,v0.16b
322	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
323	ext	v3.16b,v16.16b,v16.16b,#8
324#ifndef __AARCH64EB__
325	rev64	v5.16b,v5.16b
326	rev64	v6.16b,v6.16b
327	rev64	v7.16b,v7.16b
328	rev64	v4.16b,v4.16b
329#endif
330
331	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
332	eor	v16.16b,v16.16b,v3.16b
333	pmull2	v2.1q,v28.2d,v3.2d
334	ext	v25.16b,v7.16b,v7.16b,#8
335	pmull2	v1.1q,v27.2d,v16.2d
336
337	eor	v0.16b,v0.16b,v29.16b
338	eor	v2.16b,v2.16b,v31.16b
339	ext	v24.16b,v6.16b,v6.16b,#8
340	eor	v1.16b,v1.16b,v30.16b
341	ext	v23.16b,v5.16b,v5.16b,#8
342
343	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
344	eor	v18.16b,v0.16b,v2.16b
345	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
346	eor	v7.16b,v7.16b,v25.16b
347	eor	v1.16b,v1.16b,v17.16b
348	pmull2	v31.1q,v20.2d,v25.2d
349	eor	v1.16b,v1.16b,v18.16b
350	pmull	v30.1q,v21.1d,v7.1d
351
352	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
353	ins	v2.d[0],v1.d[1]
354	ins	v1.d[1],v0.d[0]
355	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
356	eor	v6.16b,v6.16b,v24.16b
357	pmull2	v24.1q,v22.2d,v24.2d
358	eor	v0.16b,v1.16b,v18.16b
359	pmull2	v6.1q,v21.2d,v6.2d
360
361	eor	v29.16b,v29.16b,v16.16b
362	eor	v31.16b,v31.16b,v24.16b
363	eor	v30.16b,v30.16b,v6.16b
364
365	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
366	pmull	v0.1q,v0.1d,v19.1d
367	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
368	eor	v5.16b,v5.16b,v23.16b
369	eor	v18.16b,v18.16b,v2.16b
370	pmull2	v23.1q,v26.2d,v23.2d
371	pmull	v5.1q,v27.1d,v5.1d
372
373	eor	v0.16b,v0.16b,v18.16b
374	eor	v29.16b,v29.16b,v7.16b
375	eor	v31.16b,v31.16b,v23.16b
376	ext	v0.16b,v0.16b,v0.16b,#8
377	eor	v30.16b,v30.16b,v5.16b
378
379	subs	x3,x3,#64
380	b.hs	Loop4x
381
// Last full group: the same H^4 multiply as in the loop, but no further
// 64-byte load. The product is left unreduced in v0/v1/v2.
382Ltail4x:
383	eor	v16.16b,v4.16b,v0.16b
384	ext	v3.16b,v16.16b,v16.16b,#8
385
386	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
387	eor	v16.16b,v16.16b,v3.16b
388	pmull2	v2.1q,v28.2d,v3.2d
389	pmull2	v1.1q,v27.2d,v16.2d
390
391	eor	v0.16b,v0.16b,v29.16b
392	eor	v2.16b,v2.16b,v31.16b
393	eor	v1.16b,v1.16b,v30.16b
394
// Adding 64 back turns x3 into the number of bytes not yet loaded.
// Zero goes straight to the final reduction. Below 32 takes Lone,
// exactly 32 takes Ltwo, anything larger falls through to Lthree.
395	adds	x3,x3,#64
396	b.eq	Ldone4x
397
398	cmp	x3,#32
399	b.lo	Lone
400	b.eq	Ltwo
// Three blocks remain: reduce the pending product to Xi, then form
// H^3·(Xi^I[0]) + H^2·I[1] + H·I[2], left unreduced for Ldone4x.
401Lthree:
402	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
403	eor	v18.16b,v0.16b,v2.16b
404	eor	v1.16b,v1.16b,v17.16b
405	ld1	{v4.2d,v5.2d,v6.2d},[x2]
406	eor	v1.16b,v1.16b,v18.16b
407#ifndef	__AARCH64EB__
408	rev64	v5.16b,v5.16b
409	rev64	v6.16b,v6.16b
410	rev64	v4.16b,v4.16b
411#endif
412
413	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
414	ins	v2.d[0],v1.d[1]
415	ins	v1.d[1],v0.d[0]
416	ext	v24.16b,v6.16b,v6.16b,#8
417	ext	v23.16b,v5.16b,v5.16b,#8
418	eor	v0.16b,v1.16b,v18.16b
419
420	pmull	v29.1q,v20.1d,v24.1d		//H·Ii+2
421	eor	v6.16b,v6.16b,v24.16b
422
423	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
424	pmull	v0.1q,v0.1d,v19.1d
425	eor	v18.16b,v18.16b,v2.16b
426	pmull2	v31.1q,v20.2d,v24.2d
427	pmull	v30.1q,v21.1d,v6.1d
428	eor	v0.16b,v0.16b,v18.16b
429	pmull	v7.1q,v22.1d,v23.1d		//H^2·Ii+1
430	eor	v5.16b,v5.16b,v23.16b
431	ext	v0.16b,v0.16b,v0.16b,#8
432
433	pmull2	v23.1q,v22.2d,v23.2d
434	eor	v16.16b,v4.16b,v0.16b
435	pmull2	v5.1q,v21.2d,v5.2d
436	ext	v3.16b,v16.16b,v16.16b,#8
437
438	eor	v29.16b,v29.16b,v7.16b
439	eor	v31.16b,v31.16b,v23.16b
440	eor	v30.16b,v30.16b,v5.16b
441
442	pmull	v0.1q,v26.1d,v3.1d		//H^3·(Xi+Ii)
443	eor	v16.16b,v16.16b,v3.16b
444	pmull2	v2.1q,v26.2d,v3.2d
445	pmull	v1.1q,v27.1d,v16.1d
446
447	eor	v0.16b,v0.16b,v29.16b
448	eor	v2.16b,v2.16b,v31.16b
449	eor	v1.16b,v1.16b,v30.16b
450	b	Ldone4x
451
// Two blocks remain: reduce the pending product to Xi, then form
// H^2·(Xi^I[0]) + H·I[1], left unreduced for Ldone4x.
452.align	4
453Ltwo:
454	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
455	eor	v18.16b,v0.16b,v2.16b
456	eor	v1.16b,v1.16b,v17.16b
457	ld1	{v4.2d,v5.2d},[x2]
458	eor	v1.16b,v1.16b,v18.16b
459#ifndef	__AARCH64EB__
460	rev64	v5.16b,v5.16b
461	rev64	v4.16b,v4.16b
462#endif
463
464	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
465	ins	v2.d[0],v1.d[1]
466	ins	v1.d[1],v0.d[0]
467	ext	v23.16b,v5.16b,v5.16b,#8
468	eor	v0.16b,v1.16b,v18.16b
469
470	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
471	pmull	v0.1q,v0.1d,v19.1d
472	eor	v18.16b,v18.16b,v2.16b
473	eor	v0.16b,v0.16b,v18.16b
474	ext	v0.16b,v0.16b,v0.16b,#8
475
476	pmull	v29.1q,v20.1d,v23.1d		//H·Ii+1
477	eor	v5.16b,v5.16b,v23.16b
478
479	eor	v16.16b,v4.16b,v0.16b
480	ext	v3.16b,v16.16b,v16.16b,#8
481
482	pmull2	v31.1q,v20.2d,v23.2d
483	pmull	v30.1q,v21.1d,v5.1d
484
485	pmull	v0.1q,v22.1d,v3.1d		//H^2·(Xi+Ii)
486	eor	v16.16b,v16.16b,v3.16b
487	pmull2	v2.1q,v22.2d,v3.2d
488	pmull2	v1.1q,v21.2d,v16.2d
489
490	eor	v0.16b,v0.16b,v29.16b
491	eor	v2.16b,v2.16b,v31.16b
492	eor	v1.16b,v1.16b,v30.16b
493	b	Ldone4x
494
// One block remains: reduce the pending product to Xi, then form
// H·(Xi^I[0]) and fall through into Ldone4x with it unreduced.
495.align	4
496Lone:
497	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
498	eor	v18.16b,v0.16b,v2.16b
499	eor	v1.16b,v1.16b,v17.16b
500	ld1	{v4.2d},[x2]
501	eor	v1.16b,v1.16b,v18.16b
502#ifndef	__AARCH64EB__
503	rev64	v4.16b,v4.16b
504#endif
505
506	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
507	ins	v2.d[0],v1.d[1]
508	ins	v1.d[1],v0.d[0]
509	eor	v0.16b,v1.16b,v18.16b
510
511	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
512	pmull	v0.1q,v0.1d,v19.1d
513	eor	v18.16b,v18.16b,v2.16b
514	eor	v0.16b,v0.16b,v18.16b
515	ext	v0.16b,v0.16b,v0.16b,#8
516
517	eor	v16.16b,v4.16b,v0.16b
518	ext	v3.16b,v16.16b,v16.16b,#8
519
520	pmull	v0.1q,v20.1d,v3.1d
521	eor	v16.16b,v16.16b,v3.16b
522	pmull2	v2.1q,v20.2d,v3.2d
523	pmull	v1.1q,v21.1d,v16.1d
524
// Common exit: combine and reduce whatever product is pending in
// v0/v1/v2, undo the load-time half swap and byte reversal, store Xi.
525Ldone4x:
526	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
527	eor	v18.16b,v0.16b,v2.16b
528	eor	v1.16b,v1.16b,v17.16b
529	eor	v1.16b,v1.16b,v18.16b
530
531	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
532	ins	v2.d[0],v1.d[1]
533	ins	v1.d[1],v0.d[0]
534	eor	v0.16b,v1.16b,v18.16b
535
536	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
537	pmull	v0.1q,v0.1d,v19.1d
538	eor	v18.16b,v18.16b,v2.16b
539	eor	v0.16b,v0.16b,v18.16b
540	ext	v0.16b,v0.16b,v0.16b,#8
541
542#ifndef __AARCH64EB__
543	rev64	v0.16b,v0.16b
544#endif
545	st1	{v0.2d},[x0]		//write out Xi
546
547	ret
548
549.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
550.align	2
551.align	2
552#endif
553