• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
2//
3// Licensed under the OpenSSL license (the "License").  You may not use
4// this file except in compliance with the License.  You can obtain a copy
5// in the file LICENSE in the source distribution or at
6// https://www.openssl.org/source/license.html
7
8// ====================================================================
9// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
10// project. The module is, however, dual licensed under OpenSSL and
11// CRYPTOGAMS licenses depending on where you obtain it. For further
12// details see http://www.openssl.org/~appro/cryptogams/.
13//
14// Permission to use under GPLv2 terms is granted.
15// ====================================================================
16//
17// SHA256/512 for ARMv8.
18//
19// Performance in cycles per processed byte and improvement coefficient
20// over code generated with "default" compiler:
21//
22//		SHA256-hw	SHA256(*)	SHA512
23// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
24// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
25// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
26// Denver	2.01		10.5 (+26%)	6.70 (+8%)
27// X-Gene			20.0 (+100%)	12.8 (+300%(***))
28// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
29// Kryo		1.92		17.4 (+30%)	11.2 (+8%)
30//
31// (*)	Software SHA256 results are of lesser relevance, presented
32//	mostly for informational purposes.
33// (**)	The result is a trade-off: it's possible to improve it by
34//	10% (or by 1 cycle per round), but at the cost of 20% loss
35//	on Cortex-A53 (or by 4 cycles per round).
36// (***)	Super-impressive coefficients over gcc-generated code are
37//	indication of some compiler "pathology", most notably code
38//	generated with -mgeneral-regs-only is significantly faster
39//	and the gap is only 40-90%.
40//
41// October 2016.
42//
43// Originally it was reckoned that it makes no sense to implement NEON
44// version of SHA256 for 64-bit processors. This is because performance
45// improvement on most wide-spread Cortex-A5x processors was observed
46// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
47// observed that 32-bit NEON SHA256 performs significantly better than
48// 64-bit scalar version on *some* of the more recent processors. As a
49// result, a 64-bit NEON version of SHA256 was added to provide the best
50// all-round performance. For example, it executes ~30% faster on X-Gene
51// and Mongoose. [For reference, NEON version of SHA512 is bound to
52// deliver much less improvement, likely *negative* on Cortex-A5x.
53// Which is why NEON support is limited to SHA256.]
54
55#ifndef	__KERNEL__
56# include "arm_arch.h"
57#endif
58
59.text
60
61
62.hidden	OPENSSL_armcap_P
// sha512_block_data_order -- SHA-512 block function that uses only the
// general-purpose registers.
//
// Register arguments, as consumed by the code below:
//	x0	pointer to eight 64-bit hash state words; loaded at entry and
//		written back after every block
//	x1	pointer to the input data, read in 128-byte blocks
//	x2	number of blocks; converted to an end-of-input pointer
//		(x1 + x2*128)
// NOTE(review): x2 is not tested for zero before the first block is
// processed, so callers presumably always pass at least one block -- confirm.
//
// Outside __KERNEL__ builds the routine first tests the ARMV8_SHA512 bit of
// a 32-bit capability word and, if it is set, branches to .Lv8_entry (not
// visible in this part of the file).
//
// Register use after the prologue:
//	x20-x27		working variables a..h; the roles rotate by one
//			register per round
//	x3-x15,x0,x1,x2	the sixteen message words X[0..15], in that order
//	x30		pointer into the .LK512 constant table
//	x19,x28		swap roles every round: one carries K[i], then a Ch
//			temporary, then a^b; the other carries b^c, becomes
//			Maj(a,b,c), then receives the next K
//	x16,x17		scratch (Sigma1(e); Ch(e,f,g) and Sigma0(a))
// x19-x28 and x29/x30 are saved here and restored before the final ret.
63.globl	sha512_block_data_order
64.type	sha512_block_data_order,%function
65.align	6
66sha512_block_data_order:
67#ifndef	__KERNEL__
	// The value stored at .LOPENSSL_armcap_P is added to that label's own
	// address to form the address of the capability word; ILP32 builds load
	// the stored value as a sign-extended 32-bit quantity.
68# ifdef	__ILP32__
69	ldrsw	x16,.LOPENSSL_armcap_P
70# else
71	ldr	x16,.LOPENSSL_armcap_P
72# endif
73	adr	x17,.LOPENSSL_armcap_P
74	add	x16,x16,x17
75	ldr	w16,[x16]
76	tst	w16,#ARMV8_SHA512
77	b.ne	.Lv8_entry
78#endif
	// paciasp is emitted as a raw instruction word, then a 128-byte frame
	// is built:
	//	[x29,#0]	x29,x30
	//	[x29,#16..#95]	x19-x28
	//	[x29,#96]	x0 (state pointer)
	//	[x29,#104]	end-of-input pointer
	//	[x29,#112]	input pointer, stored inside .Loop
79.inst	0xd503233f				// paciasp
80	stp	x29,x30,[sp,#-128]!
81	add	x29,sp,#0
82
83	stp	x19,x20,[sp,#16]
84	stp	x21,x22,[sp,#32]
85	stp	x23,x24,[sp,#48]
86	stp	x25,x26,[sp,#64]
87	stp	x27,x28,[sp,#80]
	// Four 8-byte slots at [sp,#0..#24], used later to spill message words.
88	sub	sp,sp,#4*8
89
	// a..h = state[0..7] in x20..x27.
90	ldp	x20,x21,[x0]				// load context
91	ldp	x22,x23,[x0,#2*8]
92	ldp	x24,x25,[x0,#4*8]
93	add	x2,x1,x2,lsl#7	// end of input
94	ldp	x26,x27,[x0,#6*8]
95	adr	x30,.LK512
	// x0 and x2 are reused as message-word registers inside the loop, so
	// both are parked in the frame.
96	stp	x0,x2,[x29,#96]
97
// .Loop -- entered once per 128-byte input block; this part runs rounds
// 0..15 while the sixteen message words are being loaded.
//
// Every round adds K[i], X[i], Ch(e,f,g) and Sigma1(e) to h, adds h to d,
// then adds Maj(a,b,c) and Sigma0(a) to h. Maj is formed as
// ((a^b)&(b^c))^b, and the a^b value of one round is reused as b^c by the
// next. The closing "h+=Sigma0(a)" of each round appears commented out at
// the end of the round; the live copy is issued a little later, after the
// next word's byte swap/load.
//
// Message words are byte-reversed with rev unless __AARCH64EB__ is defined.
98.Loop:
99	ldp	x3,x4,[x1],#2*8
100	ldr	x19,[x30],#8			// *K++
	// x28 = b^c, consumed by round 0's "(b^c)&=(a^b)" step.
101	eor	x28,x21,x22				// magic seed
	// x1 (already 16 bytes into the block) is saved because the register is
	// later overwritten with a message word.
102	str	x1,[x29,#112]
	// -- round 0: h=x27, X[0] in x3 --
103#ifndef	__AARCH64EB__
104	rev	x3,x3			// 0
105#endif
106	ror	x16,x24,#14
107	add	x27,x27,x19			// h+=K[i]
	// x6 is still free here (X[3] is loaded later) and serves as scratch.
108	eor	x6,x24,x24,ror#23
109	and	x17,x25,x24
110	bic	x19,x26,x24
111	add	x27,x27,x3			// h+=X[i]
112	orr	x17,x17,x19			// Ch(e,f,g)
113	eor	x19,x20,x21			// a^b, b^c in next round
114	eor	x16,x16,x6,ror#18	// Sigma1(e)
115	ror	x6,x20,#28
116	add	x27,x27,x17			// h+=Ch(e,f,g)
117	eor	x17,x20,x20,ror#5
118	add	x27,x27,x16			// h+=Sigma1(e)
119	and	x28,x28,x19			// (b^c)&=(a^b)
120	add	x23,x23,x27			// d+=h
121	eor	x28,x28,x21			// Maj(a,b,c)
122	eor	x17,x6,x17,ror#34	// Sigma0(a)
123	add	x27,x27,x28			// h+=Maj(a,b,c)
124	ldr	x28,[x30],#8		// *K++, x19 in next round
125	//add	x27,x27,x17			// h+=Sigma0(a)
	// -- round 1: h=x26, X[1] in x4 --
126#ifndef	__AARCH64EB__
127	rev	x4,x4			// 1
128#endif
129	ldp	x5,x6,[x1],#2*8
130	add	x27,x27,x17			// h+=Sigma0(a)
131	ror	x16,x23,#14
132	add	x26,x26,x28			// h+=K[i]
133	eor	x7,x23,x23,ror#23
134	and	x17,x24,x23
135	bic	x28,x25,x23
136	add	x26,x26,x4			// h+=X[i]
137	orr	x17,x17,x28			// Ch(e,f,g)
138	eor	x28,x27,x20			// a^b, b^c in next round
139	eor	x16,x16,x7,ror#18	// Sigma1(e)
140	ror	x7,x27,#28
141	add	x26,x26,x17			// h+=Ch(e,f,g)
142	eor	x17,x27,x27,ror#5
143	add	x26,x26,x16			// h+=Sigma1(e)
144	and	x19,x19,x28			// (b^c)&=(a^b)
145	add	x22,x22,x26			// d+=h
146	eor	x19,x19,x20			// Maj(a,b,c)
147	eor	x17,x7,x17,ror#34	// Sigma0(a)
148	add	x26,x26,x19			// h+=Maj(a,b,c)
149	ldr	x19,[x30],#8		// *K++, x28 in next round
150	//add	x26,x26,x17			// h+=Sigma0(a)
	// -- round 2: h=x25, X[2] in x5 --
151#ifndef	__AARCH64EB__
152	rev	x5,x5			// 2
153#endif
154	add	x26,x26,x17			// h+=Sigma0(a)
155	ror	x16,x22,#14
156	add	x25,x25,x19			// h+=K[i]
157	eor	x8,x22,x22,ror#23
158	and	x17,x23,x22
159	bic	x19,x24,x22
160	add	x25,x25,x5			// h+=X[i]
161	orr	x17,x17,x19			// Ch(e,f,g)
162	eor	x19,x26,x27			// a^b, b^c in next round
163	eor	x16,x16,x8,ror#18	// Sigma1(e)
164	ror	x8,x26,#28
165	add	x25,x25,x17			// h+=Ch(e,f,g)
166	eor	x17,x26,x26,ror#5
167	add	x25,x25,x16			// h+=Sigma1(e)
168	and	x28,x28,x19			// (b^c)&=(a^b)
169	add	x21,x21,x25			// d+=h
170	eor	x28,x28,x27			// Maj(a,b,c)
171	eor	x17,x8,x17,ror#34	// Sigma0(a)
172	add	x25,x25,x28			// h+=Maj(a,b,c)
173	ldr	x28,[x30],#8		// *K++, x19 in next round
174	//add	x25,x25,x17			// h+=Sigma0(a)
	// -- round 3: h=x24, X[3] in x6 --
175#ifndef	__AARCH64EB__
176	rev	x6,x6			// 3
177#endif
178	ldp	x7,x8,[x1],#2*8
179	add	x25,x25,x17			// h+=Sigma0(a)
180	ror	x16,x21,#14
181	add	x24,x24,x28			// h+=K[i]
182	eor	x9,x21,x21,ror#23
183	and	x17,x22,x21
184	bic	x28,x23,x21
185	add	x24,x24,x6			// h+=X[i]
186	orr	x17,x17,x28			// Ch(e,f,g)
187	eor	x28,x25,x26			// a^b, b^c in next round
188	eor	x16,x16,x9,ror#18	// Sigma1(e)
189	ror	x9,x25,#28
190	add	x24,x24,x17			// h+=Ch(e,f,g)
191	eor	x17,x25,x25,ror#5
192	add	x24,x24,x16			// h+=Sigma1(e)
193	and	x19,x19,x28			// (b^c)&=(a^b)
194	add	x20,x20,x24			// d+=h
195	eor	x19,x19,x26			// Maj(a,b,c)
196	eor	x17,x9,x17,ror#34	// Sigma0(a)
197	add	x24,x24,x19			// h+=Maj(a,b,c)
198	ldr	x19,[x30],#8		// *K++, x28 in next round
199	//add	x24,x24,x17			// h+=Sigma0(a)
	// -- round 4: h=x23, X[4] in x7 --
200#ifndef	__AARCH64EB__
201	rev	x7,x7			// 4
202#endif
203	add	x24,x24,x17			// h+=Sigma0(a)
204	ror	x16,x20,#14
205	add	x23,x23,x19			// h+=K[i]
206	eor	x10,x20,x20,ror#23
207	and	x17,x21,x20
208	bic	x19,x22,x20
209	add	x23,x23,x7			// h+=X[i]
210	orr	x17,x17,x19			// Ch(e,f,g)
211	eor	x19,x24,x25			// a^b, b^c in next round
212	eor	x16,x16,x10,ror#18	// Sigma1(e)
213	ror	x10,x24,#28
214	add	x23,x23,x17			// h+=Ch(e,f,g)
215	eor	x17,x24,x24,ror#5
216	add	x23,x23,x16			// h+=Sigma1(e)
217	and	x28,x28,x19			// (b^c)&=(a^b)
218	add	x27,x27,x23			// d+=h
219	eor	x28,x28,x25			// Maj(a,b,c)
220	eor	x17,x10,x17,ror#34	// Sigma0(a)
221	add	x23,x23,x28			// h+=Maj(a,b,c)
222	ldr	x28,[x30],#8		// *K++, x19 in next round
223	//add	x23,x23,x17			// h+=Sigma0(a)
	// -- round 5: h=x22, X[5] in x8 --
224#ifndef	__AARCH64EB__
225	rev	x8,x8			// 5
226#endif
227	ldp	x9,x10,[x1],#2*8
228	add	x23,x23,x17			// h+=Sigma0(a)
229	ror	x16,x27,#14
230	add	x22,x22,x28			// h+=K[i]
231	eor	x11,x27,x27,ror#23
232	and	x17,x20,x27
233	bic	x28,x21,x27
234	add	x22,x22,x8			// h+=X[i]
235	orr	x17,x17,x28			// Ch(e,f,g)
236	eor	x28,x23,x24			// a^b, b^c in next round
237	eor	x16,x16,x11,ror#18	// Sigma1(e)
238	ror	x11,x23,#28
239	add	x22,x22,x17			// h+=Ch(e,f,g)
240	eor	x17,x23,x23,ror#5
241	add	x22,x22,x16			// h+=Sigma1(e)
242	and	x19,x19,x28			// (b^c)&=(a^b)
243	add	x26,x26,x22			// d+=h
244	eor	x19,x19,x24			// Maj(a,b,c)
245	eor	x17,x11,x17,ror#34	// Sigma0(a)
246	add	x22,x22,x19			// h+=Maj(a,b,c)
247	ldr	x19,[x30],#8		// *K++, x28 in next round
248	//add	x22,x22,x17			// h+=Sigma0(a)
	// -- round 6: h=x21, X[6] in x9 --
249#ifndef	__AARCH64EB__
250	rev	x9,x9			// 6
251#endif
252	add	x22,x22,x17			// h+=Sigma0(a)
253	ror	x16,x26,#14
254	add	x21,x21,x19			// h+=K[i]
255	eor	x12,x26,x26,ror#23
256	and	x17,x27,x26
257	bic	x19,x20,x26
258	add	x21,x21,x9			// h+=X[i]
259	orr	x17,x17,x19			// Ch(e,f,g)
260	eor	x19,x22,x23			// a^b, b^c in next round
261	eor	x16,x16,x12,ror#18	// Sigma1(e)
262	ror	x12,x22,#28
263	add	x21,x21,x17			// h+=Ch(e,f,g)
264	eor	x17,x22,x22,ror#5
265	add	x21,x21,x16			// h+=Sigma1(e)
266	and	x28,x28,x19			// (b^c)&=(a^b)
267	add	x25,x25,x21			// d+=h
268	eor	x28,x28,x23			// Maj(a,b,c)
269	eor	x17,x12,x17,ror#34	// Sigma0(a)
270	add	x21,x21,x28			// h+=Maj(a,b,c)
271	ldr	x28,[x30],#8		// *K++, x19 in next round
272	//add	x21,x21,x17			// h+=Sigma0(a)
	// -- round 7: h=x20, X[7] in x10 --
273#ifndef	__AARCH64EB__
274	rev	x10,x10			// 7
275#endif
276	ldp	x11,x12,[x1],#2*8
277	add	x21,x21,x17			// h+=Sigma0(a)
278	ror	x16,x25,#14
279	add	x20,x20,x28			// h+=K[i]
280	eor	x13,x25,x25,ror#23
281	and	x17,x26,x25
282	bic	x28,x27,x25
283	add	x20,x20,x10			// h+=X[i]
284	orr	x17,x17,x28			// Ch(e,f,g)
285	eor	x28,x21,x22			// a^b, b^c in next round
286	eor	x16,x16,x13,ror#18	// Sigma1(e)
287	ror	x13,x21,#28
288	add	x20,x20,x17			// h+=Ch(e,f,g)
289	eor	x17,x21,x21,ror#5
290	add	x20,x20,x16			// h+=Sigma1(e)
291	and	x19,x19,x28			// (b^c)&=(a^b)
292	add	x24,x24,x20			// d+=h
293	eor	x19,x19,x22			// Maj(a,b,c)
294	eor	x17,x13,x17,ror#34	// Sigma0(a)
295	add	x20,x20,x19			// h+=Maj(a,b,c)
296	ldr	x19,[x30],#8		// *K++, x28 in next round
297	//add	x20,x20,x17			// h+=Sigma0(a)
	// -- round 8: h=x27, X[8] in x11 --
298#ifndef	__AARCH64EB__
299	rev	x11,x11			// 8
300#endif
301	add	x20,x20,x17			// h+=Sigma0(a)
302	ror	x16,x24,#14
303	add	x27,x27,x19			// h+=K[i]
304	eor	x14,x24,x24,ror#23
305	and	x17,x25,x24
306	bic	x19,x26,x24
307	add	x27,x27,x11			// h+=X[i]
308	orr	x17,x17,x19			// Ch(e,f,g)
309	eor	x19,x20,x21			// a^b, b^c in next round
310	eor	x16,x16,x14,ror#18	// Sigma1(e)
311	ror	x14,x20,#28
312	add	x27,x27,x17			// h+=Ch(e,f,g)
313	eor	x17,x20,x20,ror#5
314	add	x27,x27,x16			// h+=Sigma1(e)
315	and	x28,x28,x19			// (b^c)&=(a^b)
316	add	x23,x23,x27			// d+=h
317	eor	x28,x28,x21			// Maj(a,b,c)
318	eor	x17,x14,x17,ror#34	// Sigma0(a)
319	add	x27,x27,x28			// h+=Maj(a,b,c)
320	ldr	x28,[x30],#8		// *K++, x19 in next round
321	//add	x27,x27,x17			// h+=Sigma0(a)
	// -- round 9: h=x26, X[9] in x12 --
322#ifndef	__AARCH64EB__
323	rev	x12,x12			// 9
324#endif
325	ldp	x13,x14,[x1],#2*8
326	add	x27,x27,x17			// h+=Sigma0(a)
327	ror	x16,x23,#14
328	add	x26,x26,x28			// h+=K[i]
329	eor	x15,x23,x23,ror#23
330	and	x17,x24,x23
331	bic	x28,x25,x23
332	add	x26,x26,x12			// h+=X[i]
333	orr	x17,x17,x28			// Ch(e,f,g)
334	eor	x28,x27,x20			// a^b, b^c in next round
335	eor	x16,x16,x15,ror#18	// Sigma1(e)
336	ror	x15,x27,#28
337	add	x26,x26,x17			// h+=Ch(e,f,g)
338	eor	x17,x27,x27,ror#5
339	add	x26,x26,x16			// h+=Sigma1(e)
340	and	x19,x19,x28			// (b^c)&=(a^b)
341	add	x22,x22,x26			// d+=h
342	eor	x19,x19,x20			// Maj(a,b,c)
343	eor	x17,x15,x17,ror#34	// Sigma0(a)
344	add	x26,x26,x19			// h+=Maj(a,b,c)
345	ldr	x19,[x30],#8		// *K++, x28 in next round
346	//add	x26,x26,x17			// h+=Sigma0(a)
	// -- round 10: h=x25, X[10] in x13; x0 (state pointer, already saved
	// at [x29,#96]) is the scratch register --
347#ifndef	__AARCH64EB__
348	rev	x13,x13			// 10
349#endif
350	add	x26,x26,x17			// h+=Sigma0(a)
351	ror	x16,x22,#14
352	add	x25,x25,x19			// h+=K[i]
353	eor	x0,x22,x22,ror#23
354	and	x17,x23,x22
355	bic	x19,x24,x22
356	add	x25,x25,x13			// h+=X[i]
357	orr	x17,x17,x19			// Ch(e,f,g)
358	eor	x19,x26,x27			// a^b, b^c in next round
359	eor	x16,x16,x0,ror#18	// Sigma1(e)
360	ror	x0,x26,#28
361	add	x25,x25,x17			// h+=Ch(e,f,g)
362	eor	x17,x26,x26,ror#5
363	add	x25,x25,x16			// h+=Sigma1(e)
364	and	x28,x28,x19			// (b^c)&=(a^b)
365	add	x21,x21,x25			// d+=h
366	eor	x28,x28,x27			// Maj(a,b,c)
367	eor	x17,x0,x17,ror#34	// Sigma0(a)
368	add	x25,x25,x28			// h+=Maj(a,b,c)
369	ldr	x28,[x30],#8		// *K++, x19 in next round
370	//add	x25,x25,x17			// h+=Sigma0(a)
	// -- round 11: h=x24, X[11] in x14 --
371#ifndef	__AARCH64EB__
372	rev	x14,x14			// 11
373#endif
374	ldp	x15,x0,[x1],#2*8
375	add	x25,x25,x17			// h+=Sigma0(a)
	// No unloaded message register is left to borrow as scratch from here
	// on, so X[3] (x6) is spilled and x6 becomes the scratch register.
376	str	x6,[sp,#24]
377	ror	x16,x21,#14
378	add	x24,x24,x28			// h+=K[i]
379	eor	x6,x21,x21,ror#23
380	and	x17,x22,x21
381	bic	x28,x23,x21
382	add	x24,x24,x14			// h+=X[i]
383	orr	x17,x17,x28			// Ch(e,f,g)
384	eor	x28,x25,x26			// a^b, b^c in next round
385	eor	x16,x16,x6,ror#18	// Sigma1(e)
386	ror	x6,x25,#28
387	add	x24,x24,x17			// h+=Ch(e,f,g)
388	eor	x17,x25,x25,ror#5
389	add	x24,x24,x16			// h+=Sigma1(e)
390	and	x19,x19,x28			// (b^c)&=(a^b)
391	add	x20,x20,x24			// d+=h
392	eor	x19,x19,x26			// Maj(a,b,c)
393	eor	x17,x6,x17,ror#34	// Sigma0(a)
394	add	x24,x24,x19			// h+=Maj(a,b,c)
395	ldr	x19,[x30],#8		// *K++, x28 in next round
396	//add	x24,x24,x17			// h+=Sigma0(a)
	// -- round 12: h=x23, X[12] in x15 --
397#ifndef	__AARCH64EB__
398	rev	x15,x15			// 12
399#endif
400	add	x24,x24,x17			// h+=Sigma0(a)
	// Spill X[4]; x7 is this round's scratch register.
401	str	x7,[sp,#0]
402	ror	x16,x20,#14
403	add	x23,x23,x19			// h+=K[i]
404	eor	x7,x20,x20,ror#23
405	and	x17,x21,x20
406	bic	x19,x22,x20
407	add	x23,x23,x15			// h+=X[i]
408	orr	x17,x17,x19			// Ch(e,f,g)
409	eor	x19,x24,x25			// a^b, b^c in next round
410	eor	x16,x16,x7,ror#18	// Sigma1(e)
411	ror	x7,x24,#28
412	add	x23,x23,x17			// h+=Ch(e,f,g)
413	eor	x17,x24,x24,ror#5
414	add	x23,x23,x16			// h+=Sigma1(e)
415	and	x28,x28,x19			// (b^c)&=(a^b)
416	add	x27,x27,x23			// d+=h
417	eor	x28,x28,x25			// Maj(a,b,c)
418	eor	x17,x7,x17,ror#34	// Sigma0(a)
419	add	x23,x23,x28			// h+=Maj(a,b,c)
420	ldr	x28,[x30],#8		// *K++, x19 in next round
421	//add	x23,x23,x17			// h+=Sigma0(a)
	// -- round 13: h=x22, X[13] in x0 --
422#ifndef	__AARCH64EB__
423	rev	x0,x0			// 13
424#endif
	// The last two message words land in x1/x2 themselves, overwriting the
	// input pointer (its copy is at [x29,#112]) and the end pointer (its
	// copy is at [x29,#104]).
425	ldp	x1,x2,[x1]
426	add	x23,x23,x17			// h+=Sigma0(a)
	// Spill X[5]; x8 is this round's scratch register.
427	str	x8,[sp,#8]
428	ror	x16,x27,#14
429	add	x22,x22,x28			// h+=K[i]
430	eor	x8,x27,x27,ror#23
431	and	x17,x20,x27
432	bic	x28,x21,x27
433	add	x22,x22,x0			// h+=X[i]
434	orr	x17,x17,x28			// Ch(e,f,g)
435	eor	x28,x23,x24			// a^b, b^c in next round
436	eor	x16,x16,x8,ror#18	// Sigma1(e)
437	ror	x8,x23,#28
438	add	x22,x22,x17			// h+=Ch(e,f,g)
439	eor	x17,x23,x23,ror#5
440	add	x22,x22,x16			// h+=Sigma1(e)
441	and	x19,x19,x28			// (b^c)&=(a^b)
442	add	x26,x26,x22			// d+=h
443	eor	x19,x19,x24			// Maj(a,b,c)
444	eor	x17,x8,x17,ror#34	// Sigma0(a)
445	add	x22,x22,x19			// h+=Maj(a,b,c)
446	ldr	x19,[x30],#8		// *K++, x28 in next round
447	//add	x22,x22,x17			// h+=Sigma0(a)
	// -- round 14: h=x21, X[14] in x1 --
448#ifndef	__AARCH64EB__
449	rev	x1,x1			// 14
450#endif
	// Bring X[3] back into x6, then spill X[6]; x9 is the scratch register.
451	ldr	x6,[sp,#24]
452	add	x22,x22,x17			// h+=Sigma0(a)
453	str	x9,[sp,#16]
454	ror	x16,x26,#14
455	add	x21,x21,x19			// h+=K[i]
456	eor	x9,x26,x26,ror#23
457	and	x17,x27,x26
458	bic	x19,x20,x26
459	add	x21,x21,x1			// h+=X[i]
460	orr	x17,x17,x19			// Ch(e,f,g)
461	eor	x19,x22,x23			// a^b, b^c in next round
462	eor	x16,x16,x9,ror#18	// Sigma1(e)
463	ror	x9,x22,#28
464	add	x21,x21,x17			// h+=Ch(e,f,g)
465	eor	x17,x22,x22,ror#5
466	add	x21,x21,x16			// h+=Sigma1(e)
467	and	x28,x28,x19			// (b^c)&=(a^b)
468	add	x25,x25,x21			// d+=h
469	eor	x28,x28,x23			// Maj(a,b,c)
470	eor	x17,x9,x17,ror#34	// Sigma0(a)
471	add	x21,x21,x28			// h+=Maj(a,b,c)
472	ldr	x28,[x30],#8		// *K++, x19 in next round
473	//add	x21,x21,x17			// h+=Sigma0(a)
	// -- round 15: h=x20, X[15] in x2 --
	// From this round on the layout changes: Sigma1(e) and Sigma0(a) are
	// built directly from three rotations each, and the round also forms
	// the schedule word for the following round:
	//	x3 = X[0] + X[9](x12) + sigma0(X[1], x4) + sigma1(X[14], x1)
	// x8, x9 and x10 are the scratch registers (their message words are in
	// the stack slots).
474#ifndef	__AARCH64EB__
475	rev	x2,x2			// 15
476#endif
477	ldr	x7,[sp,#0]
478	add	x21,x21,x17			// h+=Sigma0(a)
479	str	x10,[sp,#24]
480	ror	x16,x25,#14
481	add	x20,x20,x28			// h+=K[i]
482	ror	x9,x4,#1
483	and	x17,x26,x25
484	ror	x8,x1,#19
485	bic	x28,x27,x25
486	ror	x10,x21,#28
487	add	x20,x20,x2			// h+=X[i]
488	eor	x16,x16,x25,ror#18
489	eor	x9,x9,x4,ror#8
490	orr	x17,x17,x28			// Ch(e,f,g)
491	eor	x28,x21,x22			// a^b, b^c in next round
492	eor	x16,x16,x25,ror#41	// Sigma1(e)
493	eor	x10,x10,x21,ror#34
494	add	x20,x20,x17			// h+=Ch(e,f,g)
495	and	x19,x19,x28			// (b^c)&=(a^b)
496	eor	x8,x8,x1,ror#61
497	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
498	add	x20,x20,x16			// h+=Sigma1(e)
499	eor	x19,x19,x22			// Maj(a,b,c)
500	eor	x17,x10,x21,ror#39	// Sigma0(a)
501	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
502	add	x3,x3,x12
503	add	x24,x24,x20			// d+=h
504	add	x20,x20,x19			// h+=Maj(a,b,c)
505	ldr	x19,[x30],#8		// *K++, x28 in next round
506	add	x3,x3,x9
507	add	x20,x20,x17			// h+=Sigma0(a)
508	add	x3,x3,x8
// .Loop_16_xx -- sixteen rounds per pass, covering the rounds after the
// first sixteen. Every step performs one compression round with the
// schedule word prepared by the previous step and, interleaved with it,
// forms the schedule word for the next step in place:
//	X[i] += X[i+9] + sigma0(X[i+1]) + sigma1(X[i+14])   (indices mod 16)
//
// Three of the sixteen message registers serve as scratch at any time;
// their message words live in the four stack slots at [sp,#0..#24]. Each
// step reloads the word whose register leaves the scratch window (ldr) and
// spills the word whose register enters it (str).
//
// The pass repeats until the K value fetched for the following round is
// zero (cbnz x19 at the bottom). The 648-byte rewind after the loop equals
// 81 table reads, which is consistent with 17 reads in .Loop plus four
// passes of 16 reads here, the last read being the zero word.
509.Loop_16_xx:
	// step 0: h=x27, X[i]=x3; forms the next schedule word in x4
510	ldr	x8,[sp,#8]
511	str	x11,[sp,#0]
512	ror	x16,x24,#14
513	add	x27,x27,x19			// h+=K[i]
514	ror	x10,x5,#1
515	and	x17,x25,x24
516	ror	x9,x2,#19
517	bic	x19,x26,x24
518	ror	x11,x20,#28
519	add	x27,x27,x3			// h+=X[i]
520	eor	x16,x16,x24,ror#18
521	eor	x10,x10,x5,ror#8
522	orr	x17,x17,x19			// Ch(e,f,g)
523	eor	x19,x20,x21			// a^b, b^c in next round
524	eor	x16,x16,x24,ror#41	// Sigma1(e)
525	eor	x11,x11,x20,ror#34
526	add	x27,x27,x17			// h+=Ch(e,f,g)
527	and	x28,x28,x19			// (b^c)&=(a^b)
528	eor	x9,x9,x2,ror#61
529	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
530	add	x27,x27,x16			// h+=Sigma1(e)
531	eor	x28,x28,x21			// Maj(a,b,c)
532	eor	x17,x11,x20,ror#39	// Sigma0(a)
533	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
534	add	x4,x4,x13
535	add	x23,x23,x27			// d+=h
536	add	x27,x27,x28			// h+=Maj(a,b,c)
537	ldr	x28,[x30],#8		// *K++, x19 in next round
538	add	x4,x4,x10
539	add	x27,x27,x17			// h+=Sigma0(a)
540	add	x4,x4,x9
	// step 1: h=x26, X[i]=x4; forms the next schedule word in x5
541	ldr	x9,[sp,#16]
542	str	x12,[sp,#8]
543	ror	x16,x23,#14
544	add	x26,x26,x28			// h+=K[i]
545	ror	x11,x6,#1
546	and	x17,x24,x23
547	ror	x10,x3,#19
548	bic	x28,x25,x23
549	ror	x12,x27,#28
550	add	x26,x26,x4			// h+=X[i]
551	eor	x16,x16,x23,ror#18
552	eor	x11,x11,x6,ror#8
553	orr	x17,x17,x28			// Ch(e,f,g)
554	eor	x28,x27,x20			// a^b, b^c in next round
555	eor	x16,x16,x23,ror#41	// Sigma1(e)
556	eor	x12,x12,x27,ror#34
557	add	x26,x26,x17			// h+=Ch(e,f,g)
558	and	x19,x19,x28			// (b^c)&=(a^b)
559	eor	x10,x10,x3,ror#61
560	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
561	add	x26,x26,x16			// h+=Sigma1(e)
562	eor	x19,x19,x20			// Maj(a,b,c)
563	eor	x17,x12,x27,ror#39	// Sigma0(a)
564	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
565	add	x5,x5,x14
566	add	x22,x22,x26			// d+=h
567	add	x26,x26,x19			// h+=Maj(a,b,c)
568	ldr	x19,[x30],#8		// *K++, x28 in next round
569	add	x5,x5,x11
570	add	x26,x26,x17			// h+=Sigma0(a)
571	add	x5,x5,x10
	// step 2: h=x25, X[i]=x5; forms the next schedule word in x6
572	ldr	x10,[sp,#24]
573	str	x13,[sp,#16]
574	ror	x16,x22,#14
575	add	x25,x25,x19			// h+=K[i]
576	ror	x12,x7,#1
577	and	x17,x23,x22
578	ror	x11,x4,#19
579	bic	x19,x24,x22
580	ror	x13,x26,#28
581	add	x25,x25,x5			// h+=X[i]
582	eor	x16,x16,x22,ror#18
583	eor	x12,x12,x7,ror#8
584	orr	x17,x17,x19			// Ch(e,f,g)
585	eor	x19,x26,x27			// a^b, b^c in next round
586	eor	x16,x16,x22,ror#41	// Sigma1(e)
587	eor	x13,x13,x26,ror#34
588	add	x25,x25,x17			// h+=Ch(e,f,g)
589	and	x28,x28,x19			// (b^c)&=(a^b)
590	eor	x11,x11,x4,ror#61
591	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
592	add	x25,x25,x16			// h+=Sigma1(e)
593	eor	x28,x28,x27			// Maj(a,b,c)
594	eor	x17,x13,x26,ror#39	// Sigma0(a)
595	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
596	add	x6,x6,x15
597	add	x21,x21,x25			// d+=h
598	add	x25,x25,x28			// h+=Maj(a,b,c)
599	ldr	x28,[x30],#8		// *K++, x19 in next round
600	add	x6,x6,x12
601	add	x25,x25,x17			// h+=Sigma0(a)
602	add	x6,x6,x11
	// step 3: h=x24, X[i]=x6; forms the next schedule word in x7
603	ldr	x11,[sp,#0]
604	str	x14,[sp,#24]
605	ror	x16,x21,#14
606	add	x24,x24,x28			// h+=K[i]
607	ror	x13,x8,#1
608	and	x17,x22,x21
609	ror	x12,x5,#19
610	bic	x28,x23,x21
611	ror	x14,x25,#28
612	add	x24,x24,x6			// h+=X[i]
613	eor	x16,x16,x21,ror#18
614	eor	x13,x13,x8,ror#8
615	orr	x17,x17,x28			// Ch(e,f,g)
616	eor	x28,x25,x26			// a^b, b^c in next round
617	eor	x16,x16,x21,ror#41	// Sigma1(e)
618	eor	x14,x14,x25,ror#34
619	add	x24,x24,x17			// h+=Ch(e,f,g)
620	and	x19,x19,x28			// (b^c)&=(a^b)
621	eor	x12,x12,x5,ror#61
622	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
623	add	x24,x24,x16			// h+=Sigma1(e)
624	eor	x19,x19,x26			// Maj(a,b,c)
625	eor	x17,x14,x25,ror#39	// Sigma0(a)
626	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
627	add	x7,x7,x0
628	add	x20,x20,x24			// d+=h
629	add	x24,x24,x19			// h+=Maj(a,b,c)
630	ldr	x19,[x30],#8		// *K++, x28 in next round
631	add	x7,x7,x13
632	add	x24,x24,x17			// h+=Sigma0(a)
633	add	x7,x7,x12
	// step 4: h=x23, X[i]=x7; forms the next schedule word in x8
634	ldr	x12,[sp,#8]
635	str	x15,[sp,#0]
636	ror	x16,x20,#14
637	add	x23,x23,x19			// h+=K[i]
638	ror	x14,x9,#1
639	and	x17,x21,x20
640	ror	x13,x6,#19
641	bic	x19,x22,x20
642	ror	x15,x24,#28
643	add	x23,x23,x7			// h+=X[i]
644	eor	x16,x16,x20,ror#18
645	eor	x14,x14,x9,ror#8
646	orr	x17,x17,x19			// Ch(e,f,g)
647	eor	x19,x24,x25			// a^b, b^c in next round
648	eor	x16,x16,x20,ror#41	// Sigma1(e)
649	eor	x15,x15,x24,ror#34
650	add	x23,x23,x17			// h+=Ch(e,f,g)
651	and	x28,x28,x19			// (b^c)&=(a^b)
652	eor	x13,x13,x6,ror#61
653	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
654	add	x23,x23,x16			// h+=Sigma1(e)
655	eor	x28,x28,x25			// Maj(a,b,c)
656	eor	x17,x15,x24,ror#39	// Sigma0(a)
657	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
658	add	x8,x8,x1
659	add	x27,x27,x23			// d+=h
660	add	x23,x23,x28			// h+=Maj(a,b,c)
661	ldr	x28,[x30],#8		// *K++, x19 in next round
662	add	x8,x8,x14
663	add	x23,x23,x17			// h+=Sigma0(a)
664	add	x8,x8,x13
	// step 5: h=x22, X[i]=x8; forms the next schedule word in x9
665	ldr	x13,[sp,#16]
666	str	x0,[sp,#8]
667	ror	x16,x27,#14
668	add	x22,x22,x28			// h+=K[i]
669	ror	x15,x10,#1
670	and	x17,x20,x27
671	ror	x14,x7,#19
672	bic	x28,x21,x27
673	ror	x0,x23,#28
674	add	x22,x22,x8			// h+=X[i]
675	eor	x16,x16,x27,ror#18
676	eor	x15,x15,x10,ror#8
677	orr	x17,x17,x28			// Ch(e,f,g)
678	eor	x28,x23,x24			// a^b, b^c in next round
679	eor	x16,x16,x27,ror#41	// Sigma1(e)
680	eor	x0,x0,x23,ror#34
681	add	x22,x22,x17			// h+=Ch(e,f,g)
682	and	x19,x19,x28			// (b^c)&=(a^b)
683	eor	x14,x14,x7,ror#61
684	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
685	add	x22,x22,x16			// h+=Sigma1(e)
686	eor	x19,x19,x24			// Maj(a,b,c)
687	eor	x17,x0,x23,ror#39	// Sigma0(a)
688	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
689	add	x9,x9,x2
690	add	x26,x26,x22			// d+=h
691	add	x22,x22,x19			// h+=Maj(a,b,c)
692	ldr	x19,[x30],#8		// *K++, x28 in next round
693	add	x9,x9,x15
694	add	x22,x22,x17			// h+=Sigma0(a)
695	add	x9,x9,x14
	// step 6: h=x21, X[i]=x9; forms the next schedule word in x10
696	ldr	x14,[sp,#24]
697	str	x1,[sp,#16]
698	ror	x16,x26,#14
699	add	x21,x21,x19			// h+=K[i]
700	ror	x0,x11,#1
701	and	x17,x27,x26
702	ror	x15,x8,#19
703	bic	x19,x20,x26
704	ror	x1,x22,#28
705	add	x21,x21,x9			// h+=X[i]
706	eor	x16,x16,x26,ror#18
707	eor	x0,x0,x11,ror#8
708	orr	x17,x17,x19			// Ch(e,f,g)
709	eor	x19,x22,x23			// a^b, b^c in next round
710	eor	x16,x16,x26,ror#41	// Sigma1(e)
711	eor	x1,x1,x22,ror#34
712	add	x21,x21,x17			// h+=Ch(e,f,g)
713	and	x28,x28,x19			// (b^c)&=(a^b)
714	eor	x15,x15,x8,ror#61
715	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
716	add	x21,x21,x16			// h+=Sigma1(e)
717	eor	x28,x28,x23			// Maj(a,b,c)
718	eor	x17,x1,x22,ror#39	// Sigma0(a)
719	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
720	add	x10,x10,x3
721	add	x25,x25,x21			// d+=h
722	add	x21,x21,x28			// h+=Maj(a,b,c)
723	ldr	x28,[x30],#8		// *K++, x19 in next round
724	add	x10,x10,x0
725	add	x21,x21,x17			// h+=Sigma0(a)
726	add	x10,x10,x15
	// step 7: h=x20, X[i]=x10; forms the next schedule word in x11
727	ldr	x15,[sp,#0]
728	str	x2,[sp,#24]
729	ror	x16,x25,#14
730	add	x20,x20,x28			// h+=K[i]
731	ror	x1,x12,#1
732	and	x17,x26,x25
733	ror	x0,x9,#19
734	bic	x28,x27,x25
735	ror	x2,x21,#28
736	add	x20,x20,x10			// h+=X[i]
737	eor	x16,x16,x25,ror#18
738	eor	x1,x1,x12,ror#8
739	orr	x17,x17,x28			// Ch(e,f,g)
740	eor	x28,x21,x22			// a^b, b^c in next round
741	eor	x16,x16,x25,ror#41	// Sigma1(e)
742	eor	x2,x2,x21,ror#34
743	add	x20,x20,x17			// h+=Ch(e,f,g)
744	and	x19,x19,x28			// (b^c)&=(a^b)
745	eor	x0,x0,x9,ror#61
746	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
747	add	x20,x20,x16			// h+=Sigma1(e)
748	eor	x19,x19,x22			// Maj(a,b,c)
749	eor	x17,x2,x21,ror#39	// Sigma0(a)
750	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
751	add	x11,x11,x4
752	add	x24,x24,x20			// d+=h
753	add	x20,x20,x19			// h+=Maj(a,b,c)
754	ldr	x19,[x30],#8		// *K++, x28 in next round
755	add	x11,x11,x1
756	add	x20,x20,x17			// h+=Sigma0(a)
757	add	x11,x11,x0
	// step 8: h=x27, X[i]=x11; forms the next schedule word in x12
758	ldr	x0,[sp,#8]
759	str	x3,[sp,#0]
760	ror	x16,x24,#14
761	add	x27,x27,x19			// h+=K[i]
762	ror	x2,x13,#1
763	and	x17,x25,x24
764	ror	x1,x10,#19
765	bic	x19,x26,x24
766	ror	x3,x20,#28
767	add	x27,x27,x11			// h+=X[i]
768	eor	x16,x16,x24,ror#18
769	eor	x2,x2,x13,ror#8
770	orr	x17,x17,x19			// Ch(e,f,g)
771	eor	x19,x20,x21			// a^b, b^c in next round
772	eor	x16,x16,x24,ror#41	// Sigma1(e)
773	eor	x3,x3,x20,ror#34
774	add	x27,x27,x17			// h+=Ch(e,f,g)
775	and	x28,x28,x19			// (b^c)&=(a^b)
776	eor	x1,x1,x10,ror#61
777	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
778	add	x27,x27,x16			// h+=Sigma1(e)
779	eor	x28,x28,x21			// Maj(a,b,c)
780	eor	x17,x3,x20,ror#39	// Sigma0(a)
781	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
782	add	x12,x12,x5
783	add	x23,x23,x27			// d+=h
784	add	x27,x27,x28			// h+=Maj(a,b,c)
785	ldr	x28,[x30],#8		// *K++, x19 in next round
786	add	x12,x12,x2
787	add	x27,x27,x17			// h+=Sigma0(a)
788	add	x12,x12,x1
	// step 9: h=x26, X[i]=x12; forms the next schedule word in x13
789	ldr	x1,[sp,#16]
790	str	x4,[sp,#8]
791	ror	x16,x23,#14
792	add	x26,x26,x28			// h+=K[i]
793	ror	x3,x14,#1
794	and	x17,x24,x23
795	ror	x2,x11,#19
796	bic	x28,x25,x23
797	ror	x4,x27,#28
798	add	x26,x26,x12			// h+=X[i]
799	eor	x16,x16,x23,ror#18
800	eor	x3,x3,x14,ror#8
801	orr	x17,x17,x28			// Ch(e,f,g)
802	eor	x28,x27,x20			// a^b, b^c in next round
803	eor	x16,x16,x23,ror#41	// Sigma1(e)
804	eor	x4,x4,x27,ror#34
805	add	x26,x26,x17			// h+=Ch(e,f,g)
806	and	x19,x19,x28			// (b^c)&=(a^b)
807	eor	x2,x2,x11,ror#61
808	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
809	add	x26,x26,x16			// h+=Sigma1(e)
810	eor	x19,x19,x20			// Maj(a,b,c)
811	eor	x17,x4,x27,ror#39	// Sigma0(a)
812	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
813	add	x13,x13,x6
814	add	x22,x22,x26			// d+=h
815	add	x26,x26,x19			// h+=Maj(a,b,c)
816	ldr	x19,[x30],#8		// *K++, x28 in next round
817	add	x13,x13,x3
818	add	x26,x26,x17			// h+=Sigma0(a)
819	add	x13,x13,x2
	// step 10: h=x25, X[i]=x13; forms the next schedule word in x14
820	ldr	x2,[sp,#24]
821	str	x5,[sp,#16]
822	ror	x16,x22,#14
823	add	x25,x25,x19			// h+=K[i]
824	ror	x4,x15,#1
825	and	x17,x23,x22
826	ror	x3,x12,#19
827	bic	x19,x24,x22
828	ror	x5,x26,#28
829	add	x25,x25,x13			// h+=X[i]
830	eor	x16,x16,x22,ror#18
831	eor	x4,x4,x15,ror#8
832	orr	x17,x17,x19			// Ch(e,f,g)
833	eor	x19,x26,x27			// a^b, b^c in next round
834	eor	x16,x16,x22,ror#41	// Sigma1(e)
835	eor	x5,x5,x26,ror#34
836	add	x25,x25,x17			// h+=Ch(e,f,g)
837	and	x28,x28,x19			// (b^c)&=(a^b)
838	eor	x3,x3,x12,ror#61
839	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
840	add	x25,x25,x16			// h+=Sigma1(e)
841	eor	x28,x28,x27			// Maj(a,b,c)
842	eor	x17,x5,x26,ror#39	// Sigma0(a)
843	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
844	add	x14,x14,x7
845	add	x21,x21,x25			// d+=h
846	add	x25,x25,x28			// h+=Maj(a,b,c)
847	ldr	x28,[x30],#8		// *K++, x19 in next round
848	add	x14,x14,x4
849	add	x25,x25,x17			// h+=Sigma0(a)
850	add	x14,x14,x3
	// step 11: h=x24, X[i]=x14; forms the next schedule word in x15
851	ldr	x3,[sp,#0]
852	str	x6,[sp,#24]
853	ror	x16,x21,#14
854	add	x24,x24,x28			// h+=K[i]
855	ror	x5,x0,#1
856	and	x17,x22,x21
857	ror	x4,x13,#19
858	bic	x28,x23,x21
859	ror	x6,x25,#28
860	add	x24,x24,x14			// h+=X[i]
861	eor	x16,x16,x21,ror#18
862	eor	x5,x5,x0,ror#8
863	orr	x17,x17,x28			// Ch(e,f,g)
864	eor	x28,x25,x26			// a^b, b^c in next round
865	eor	x16,x16,x21,ror#41	// Sigma1(e)
866	eor	x6,x6,x25,ror#34
867	add	x24,x24,x17			// h+=Ch(e,f,g)
868	and	x19,x19,x28			// (b^c)&=(a^b)
869	eor	x4,x4,x13,ror#61
870	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
871	add	x24,x24,x16			// h+=Sigma1(e)
872	eor	x19,x19,x26			// Maj(a,b,c)
873	eor	x17,x6,x25,ror#39	// Sigma0(a)
874	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
875	add	x15,x15,x8
876	add	x20,x20,x24			// d+=h
877	add	x24,x24,x19			// h+=Maj(a,b,c)
878	ldr	x19,[x30],#8		// *K++, x28 in next round
879	add	x15,x15,x5
880	add	x24,x24,x17			// h+=Sigma0(a)
881	add	x15,x15,x4
	// step 12: h=x23, X[i]=x15; forms the next schedule word in x0
882	ldr	x4,[sp,#8]
883	str	x7,[sp,#0]
884	ror	x16,x20,#14
885	add	x23,x23,x19			// h+=K[i]
886	ror	x6,x1,#1
887	and	x17,x21,x20
888	ror	x5,x14,#19
889	bic	x19,x22,x20
890	ror	x7,x24,#28
891	add	x23,x23,x15			// h+=X[i]
892	eor	x16,x16,x20,ror#18
893	eor	x6,x6,x1,ror#8
894	orr	x17,x17,x19			// Ch(e,f,g)
895	eor	x19,x24,x25			// a^b, b^c in next round
896	eor	x16,x16,x20,ror#41	// Sigma1(e)
897	eor	x7,x7,x24,ror#34
898	add	x23,x23,x17			// h+=Ch(e,f,g)
899	and	x28,x28,x19			// (b^c)&=(a^b)
900	eor	x5,x5,x14,ror#61
901	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
902	add	x23,x23,x16			// h+=Sigma1(e)
903	eor	x28,x28,x25			// Maj(a,b,c)
904	eor	x17,x7,x24,ror#39	// Sigma0(a)
905	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
906	add	x0,x0,x9
907	add	x27,x27,x23			// d+=h
908	add	x23,x23,x28			// h+=Maj(a,b,c)
909	ldr	x28,[x30],#8		// *K++, x19 in next round
910	add	x0,x0,x6
911	add	x23,x23,x17			// h+=Sigma0(a)
912	add	x0,x0,x5
	// step 13: h=x22, X[i]=x0; forms the next schedule word in x1
913	ldr	x5,[sp,#16]
914	str	x8,[sp,#8]
915	ror	x16,x27,#14
916	add	x22,x22,x28			// h+=K[i]
917	ror	x7,x2,#1
918	and	x17,x20,x27
919	ror	x6,x15,#19
920	bic	x28,x21,x27
921	ror	x8,x23,#28
922	add	x22,x22,x0			// h+=X[i]
923	eor	x16,x16,x27,ror#18
924	eor	x7,x7,x2,ror#8
925	orr	x17,x17,x28			// Ch(e,f,g)
926	eor	x28,x23,x24			// a^b, b^c in next round
927	eor	x16,x16,x27,ror#41	// Sigma1(e)
928	eor	x8,x8,x23,ror#34
929	add	x22,x22,x17			// h+=Ch(e,f,g)
930	and	x19,x19,x28			// (b^c)&=(a^b)
931	eor	x6,x6,x15,ror#61
932	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
933	add	x22,x22,x16			// h+=Sigma1(e)
934	eor	x19,x19,x24			// Maj(a,b,c)
935	eor	x17,x8,x23,ror#39	// Sigma0(a)
936	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
937	add	x1,x1,x10
938	add	x26,x26,x22			// d+=h
939	add	x22,x22,x19			// h+=Maj(a,b,c)
940	ldr	x19,[x30],#8		// *K++, x28 in next round
941	add	x1,x1,x7
942	add	x22,x22,x17			// h+=Sigma0(a)
943	add	x1,x1,x6
	// step 14: h=x21, X[i]=x1; forms the next schedule word in x2
944	ldr	x6,[sp,#24]
945	str	x9,[sp,#16]
946	ror	x16,x26,#14
947	add	x21,x21,x19			// h+=K[i]
948	ror	x8,x3,#1
949	and	x17,x27,x26
950	ror	x7,x0,#19
951	bic	x19,x20,x26
952	ror	x9,x22,#28
953	add	x21,x21,x1			// h+=X[i]
954	eor	x16,x16,x26,ror#18
955	eor	x8,x8,x3,ror#8
956	orr	x17,x17,x19			// Ch(e,f,g)
957	eor	x19,x22,x23			// a^b, b^c in next round
958	eor	x16,x16,x26,ror#41	// Sigma1(e)
959	eor	x9,x9,x22,ror#34
960	add	x21,x21,x17			// h+=Ch(e,f,g)
961	and	x28,x28,x19			// (b^c)&=(a^b)
962	eor	x7,x7,x0,ror#61
963	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
964	add	x21,x21,x16			// h+=Sigma1(e)
965	eor	x28,x28,x23			// Maj(a,b,c)
966	eor	x17,x9,x22,ror#39	// Sigma0(a)
967	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
968	add	x2,x2,x11
969	add	x25,x25,x21			// d+=h
970	add	x21,x21,x28			// h+=Maj(a,b,c)
971	ldr	x28,[x30],#8		// *K++, x19 in next round
972	add	x2,x2,x8
973	add	x21,x21,x17			// h+=Sigma0(a)
974	add	x2,x2,x7
	// step 15: h=x20, X[i]=x2; forms the next schedule word in x3, which
	// is what step 0 of the following pass consumes
975	ldr	x7,[sp,#0]
976	str	x10,[sp,#24]
977	ror	x16,x25,#14
978	add	x20,x20,x28			// h+=K[i]
979	ror	x9,x4,#1
980	and	x17,x26,x25
981	ror	x8,x1,#19
982	bic	x28,x27,x25
983	ror	x10,x21,#28
984	add	x20,x20,x2			// h+=X[i]
985	eor	x16,x16,x25,ror#18
986	eor	x9,x9,x4,ror#8
987	orr	x17,x17,x28			// Ch(e,f,g)
988	eor	x28,x21,x22			// a^b, b^c in next round
989	eor	x16,x16,x25,ror#41	// Sigma1(e)
990	eor	x10,x10,x21,ror#34
991	add	x20,x20,x17			// h+=Ch(e,f,g)
992	and	x19,x19,x28			// (b^c)&=(a^b)
993	eor	x8,x8,x1,ror#61
994	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
995	add	x20,x20,x16			// h+=Sigma1(e)
996	eor	x19,x19,x22			// Maj(a,b,c)
997	eor	x17,x10,x21,ror#39	// Sigma0(a)
998	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
999	add	x3,x3,x12
1000	add	x24,x24,x20			// d+=h
1001	add	x20,x20,x19			// h+=Maj(a,b,c)
1002	ldr	x19,[x30],#8		// *K++, x28 in next round
1003	add	x3,x3,x9
1004	add	x20,x20,x17			// h+=Sigma0(a)
1005	add	x3,x3,x8
	// x19 holds the table entry fetched for the following round; a zero
	// entry ends the rounds for this block.
1006	cbnz	x19,.Loop_16_xx
1007
	// Recover the state pointer (x0), the end-of-input pointer (x2) and the
	// input pointer that .Loop saved after consuming the first 16 bytes (x1).
1008	ldp	x0,x2,[x29,#96]
1009	ldr	x1,[x29,#112]
1010	sub	x30,x30,#648		// rewind
1011
	// Add the working variables to the previous state and store the sums
	// back through x0. The saved x1 was 16 bytes into the block, so adding
	// 14*8 moves it to the start of the next 128-byte block.
1012	ldp	x3,x4,[x0]
1013	ldp	x5,x6,[x0,#2*8]
1014	add	x1,x1,#14*8			// advance input pointer
1015	ldp	x7,x8,[x0,#4*8]
1016	add	x20,x20,x3
1017	ldp	x9,x10,[x0,#6*8]
1018	add	x21,x21,x4
1019	add	x22,x22,x5
1020	add	x23,x23,x6
1021	stp	x20,x21,[x0]
1022	add	x24,x24,x7
1023	add	x25,x25,x8
1024	stp	x22,x23,[x0,#2*8]
1025	add	x26,x26,x9
1026	add	x27,x27,x10
	// The flags set by this cmp are consumed by the b.ne below; the two stp
	// in between do not modify them.
1027	cmp	x1,x2
1028	stp	x24,x25,[x0,#4*8]
1029	stp	x26,x27,[x0,#6*8]
1030	b.ne	.Loop
1031
	// Epilogue: restore x19-x28 through x29 (the frame base), release the
	// 32-byte spill area, pop the 128-byte frame, then autiasp (raw
	// encoding) and return.
1032	ldp	x19,x20,[x29,#16]
1033	add	sp,sp,#4*8
1034	ldp	x21,x22,[x29,#32]
1035	ldp	x23,x24,[x29,#48]
1036	ldp	x25,x26,[x29,#64]
1037	ldp	x27,x28,[x29,#80]
1038	ldp	x29,x30,[sp],#128
1039.inst	0xd50323bf				// autiasp
1040	ret
1041.size	sha512_block_data_order,.-sha512_block_data_order
1042
// .LK512: table of the 80 64-bit SHA-512 round constants (40 .quad lines,
// two constants each), aligned to 64 bytes and followed by a single zero
// .quad.  The scalar round loop above reads the table one .quad at a time
// and stops when it loads the zero ("cbnz x19,.Loop_16_xx"), then rewinds
// its pointer by 648 = 81*8 bytes; sha512_block_armv8 below reads the table
// 16 bytes at a time and rewinds by 80*8, i.e. it never reads the terminator.
1043.align	6
1044.type	.LK512,%object
1045.LK512:
1046.quad	0x428a2f98d728ae22,0x7137449123ef65cd
1047.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
1048.quad	0x3956c25bf348b538,0x59f111f1b605d019
1049.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
1050.quad	0xd807aa98a3030242,0x12835b0145706fbe
1051.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
1052.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
1053.quad	0x9bdc06a725c71235,0xc19bf174cf692694
1054.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
1055.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
1056.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
1057.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
1058.quad	0x983e5152ee66dfab,0xa831c66d2db43210
1059.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
1060.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
1061.quad	0x06ca6351e003826f,0x142929670a0e6e70
1062.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
1063.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
1064.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
1065.quad	0x81c2c92e47edaee6,0x92722c851482353b
1066.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
1067.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
1068.quad	0xd192e819d6ef5218,0xd69906245565a910
1069.quad	0xf40e35855771202a,0x106aa07032bbd1b8
1070.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
1071.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
1072.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
1073.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
1074.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
1075.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
1076.quad	0x90befffa23631e28,0xa4506cebde82bde9
1077.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
1078.quad	0xca273eceea26619c,0xd186b8c721c0c207
1079.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
1080.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
1081.quad	0x113f9804bef90dae,0x1b710b35131c471b
1082.quad	0x28db77f523047d84,0x32caab7b40c72493
1083.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
1084.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
1085.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
1086.quad	0	// terminator
1087.size	.LK512,.-.LK512
// .LOPENSSL_armcap_P: one 8-byte-aligned data word holding the distance from
// this location to the external symbol OPENSSL_armcap_P (symbol minus ".").
// The word is 32 bits wide when __ILP32__ is defined and 64 bits otherwise,
// and is omitted entirely when building with __KERNEL__ defined.
// NOTE(review): the code that reads this word is outside the lines shown
// here; presumably it adds the word to its own address to locate the
// capability flags position-independently - confirm against the entry code.
1088#ifndef	__KERNEL__
1089.align	3
1090.LOPENSSL_armcap_P:
1091# ifdef	__ILP32__
1092.long	OPENSSL_armcap_P-.
1093# else
1094.quad	OPENSSL_armcap_P-.
1095# endif
1096#endif
// NUL-terminated ASCII identification string embedded in the object:
// "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
// The two .align 2 directives that follow restore 4-byte alignment for the
// code after the string; the second one is redundant but harmless.
1097.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1098.align	2
1099.align	2
// sha512_block_armv8: SHA-512 block function built on the ARMv8 SHA-512
// instructions (sha512h, sha512h2, sha512su0, sha512su1).  Those instructions
// are emitted below as raw .inst words, with the mnemonic given in the
// trailing comment (presumably so that assemblers without SHA-512 support
// can still build the file).  Not built when __KERNEL__ is defined.
//
// Register use, as established by the code below:
//   x0      - pointer to the 64-byte hash state (loaded at entry, stored at exit)
//   x1      - input pointer; every load from it post-increments it
//   x2      - number of 128-byte blocks left; decremented once per .Loop_hw pass
//   x3      - cursor into .LK512, rewound by 80*8 at the end of each pass
//   x4      - x1-128, source of the conditional rewind on the last block
//   v0-v3   - hash state at the top and bottom of each pass; v4 is the fifth
//             register the working state rotates through inside the pass
//   v5-v7   - ext temporaries
//   v16-v23 - the sixteen 64-bit message words of the current block
//   v24,v25 - K512 pairs, then K+W sums
//   v26-v29 - copy of the state taken at the top of the pass, added back at
//             the end
// Only v0-v7 and v16-v29 are written, so no callee-saved SIMD register
// (v8-v15) is touched.
// NOTE(review): the loop body always runs at least once and x2 is decremented
// before it is tested, so callers presumably must pass x2 >= 1 - confirm.
// NOTE(review): .Lv8_entry is not referenced in the lines shown here;
// presumably the generic entry point branches to it - confirm.
1100#ifndef	__KERNEL__
1101.type	sha512_block_armv8,%function
1102.align	6
1103sha512_block_armv8:
1104.Lv8_entry:
1105	stp	x29,x30,[sp,#-16]!
1106	add	x29,sp,#0
1107
// Load the first 128-byte block into v16-v23; x1 ends up just past it.
1108	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64	// load input
1109	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
1110
1111	ld1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// load context
1112	adr	x3,.LK512
1113
// Byte-reverse every 64-bit message word.
1114	rev64	v16.16b,v16.16b
1115	rev64	v17.16b,v17.16b
1116	rev64	v18.16b,v18.16b
1117	rev64	v19.16b,v19.16b
1118	rev64	v20.16b,v20.16b
1119	rev64	v21.16b,v21.16b
1120	rev64	v22.16b,v22.16b
1121	rev64	v23.16b,v23.16b
// Branch to the loop head, skipping whatever padding .align 4 inserts.
1122	b	.Loop_hw
1123
1124.align	4
// One pass of .Loop_hw processes one 128-byte block.
1125.Loop_hw:
1126	ld1	{v24.2d},[x3],#16
1127	subs	x2,x2,#1
1128	sub	x4,x1,#128
1129	orr	v26.16b,v0.16b,v0.16b			// offload
1130	orr	v27.16b,v1.16b,v1.16b
1131	orr	v28.16b,v2.16b,v2.16b
1132	orr	v29.16b,v3.16b,v3.16b
// Uses the flags from the subs above: on the last block (x2 == 0) x1 is moved
// back to the start of the block being processed, so the "load next input"
// loads at the end of this pass re-read that block instead of the memory
// that follows it.
1133	csel	x1,x1,x4,ne			// conditional rewind
// ---- groups 1-8 (first 16 message words) ----
// Each group below consumes one K512 pair and one message register (two
// rounds' worth): K+W is formed in v24/v25, its two 64-bit halves are
// swapped with ext #8, and the sum is added into the state register that
// sha512h/sha512h2 then update.  The K pair for the following group is
// loaded one group ahead.  sha512su0/sha512su1 overwrite the message
// register just used with the words needed eight groups (16 rounds) later.
1134	add	v24.2d,v24.2d,v16.2d
1135	ld1	{v25.2d},[x3],#16
1136	ext	v24.16b,v24.16b,v24.16b,#8
1137	ext	v5.16b,v2.16b,v3.16b,#8
1138	ext	v6.16b,v1.16b,v2.16b,#8
1139	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1140.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1141	ext	v7.16b,v20.16b,v21.16b,#8
1142.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1143.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1144	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1145.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1146	add	v25.2d,v25.2d,v17.2d
1147	ld1	{v24.2d},[x3],#16
1148	ext	v25.16b,v25.16b,v25.16b,#8
1149	ext	v5.16b,v4.16b,v2.16b,#8
1150	ext	v6.16b,v0.16b,v4.16b,#8
1151	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1152.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1153	ext	v7.16b,v21.16b,v22.16b,#8
1154.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1155.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1156	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1157.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1158	add	v24.2d,v24.2d,v18.2d
1159	ld1	{v25.2d},[x3],#16
1160	ext	v24.16b,v24.16b,v24.16b,#8
1161	ext	v5.16b,v1.16b,v4.16b,#8
1162	ext	v6.16b,v3.16b,v1.16b,#8
1163	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1164.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1165	ext	v7.16b,v22.16b,v23.16b,#8
1166.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1167.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1168	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1169.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1170	add	v25.2d,v25.2d,v19.2d
1171	ld1	{v24.2d},[x3],#16
1172	ext	v25.16b,v25.16b,v25.16b,#8
1173	ext	v5.16b,v0.16b,v1.16b,#8
1174	ext	v6.16b,v2.16b,v0.16b,#8
1175	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1176.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1177	ext	v7.16b,v23.16b,v16.16b,#8
1178.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1179.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1180	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1181.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1182	add	v24.2d,v24.2d,v20.2d
1183	ld1	{v25.2d},[x3],#16
1184	ext	v24.16b,v24.16b,v24.16b,#8
1185	ext	v5.16b,v3.16b,v0.16b,#8
1186	ext	v6.16b,v4.16b,v3.16b,#8
1187	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1188.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1189	ext	v7.16b,v16.16b,v17.16b,#8
1190.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1191.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1192	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1193.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1194	add	v25.2d,v25.2d,v21.2d
1195	ld1	{v24.2d},[x3],#16
1196	ext	v25.16b,v25.16b,v25.16b,#8
1197	ext	v5.16b,v2.16b,v3.16b,#8
1198	ext	v6.16b,v1.16b,v2.16b,#8
1199	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1200.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1201	ext	v7.16b,v17.16b,v18.16b,#8
1202.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1203.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1204	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1205.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1206	add	v24.2d,v24.2d,v22.2d
1207	ld1	{v25.2d},[x3],#16
1208	ext	v24.16b,v24.16b,v24.16b,#8
1209	ext	v5.16b,v4.16b,v2.16b,#8
1210	ext	v6.16b,v0.16b,v4.16b,#8
1211	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1212.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1213	ext	v7.16b,v18.16b,v19.16b,#8
1214.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1215.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1216	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1217.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1218	add	v25.2d,v25.2d,v23.2d
1219	ld1	{v24.2d},[x3],#16
1220	ext	v25.16b,v25.16b,v25.16b,#8
1221	ext	v5.16b,v1.16b,v4.16b,#8
1222	ext	v6.16b,v3.16b,v1.16b,#8
1223	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1224.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1225	ext	v7.16b,v19.16b,v20.16b,#8
1226.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1227.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1228	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1229.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
// ---- groups 9-16 ----
1230	add	v24.2d,v24.2d,v16.2d
1231	ld1	{v25.2d},[x3],#16
1232	ext	v24.16b,v24.16b,v24.16b,#8
1233	ext	v5.16b,v0.16b,v1.16b,#8
1234	ext	v6.16b,v2.16b,v0.16b,#8
1235	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1236.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1237	ext	v7.16b,v20.16b,v21.16b,#8
1238.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1239.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1240	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1241.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1242	add	v25.2d,v25.2d,v17.2d
1243	ld1	{v24.2d},[x3],#16
1244	ext	v25.16b,v25.16b,v25.16b,#8
1245	ext	v5.16b,v3.16b,v0.16b,#8
1246	ext	v6.16b,v4.16b,v3.16b,#8
1247	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1248.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1249	ext	v7.16b,v21.16b,v22.16b,#8
1250.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1251.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1252	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1253.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1254	add	v24.2d,v24.2d,v18.2d
1255	ld1	{v25.2d},[x3],#16
1256	ext	v24.16b,v24.16b,v24.16b,#8
1257	ext	v5.16b,v2.16b,v3.16b,#8
1258	ext	v6.16b,v1.16b,v2.16b,#8
1259	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1260.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1261	ext	v7.16b,v22.16b,v23.16b,#8
1262.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1263.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1264	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1265.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1266	add	v25.2d,v25.2d,v19.2d
1267	ld1	{v24.2d},[x3],#16
1268	ext	v25.16b,v25.16b,v25.16b,#8
1269	ext	v5.16b,v4.16b,v2.16b,#8
1270	ext	v6.16b,v0.16b,v4.16b,#8
1271	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1272.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1273	ext	v7.16b,v23.16b,v16.16b,#8
1274.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1275.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1276	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1277.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1278	add	v24.2d,v24.2d,v20.2d
1279	ld1	{v25.2d},[x3],#16
1280	ext	v24.16b,v24.16b,v24.16b,#8
1281	ext	v5.16b,v1.16b,v4.16b,#8
1282	ext	v6.16b,v3.16b,v1.16b,#8
1283	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1284.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1285	ext	v7.16b,v16.16b,v17.16b,#8
1286.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1287.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1288	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1289.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1290	add	v25.2d,v25.2d,v21.2d
1291	ld1	{v24.2d},[x3],#16
1292	ext	v25.16b,v25.16b,v25.16b,#8
1293	ext	v5.16b,v0.16b,v1.16b,#8
1294	ext	v6.16b,v2.16b,v0.16b,#8
1295	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1296.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1297	ext	v7.16b,v17.16b,v18.16b,#8
1298.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1299.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1300	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1301.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1302	add	v24.2d,v24.2d,v22.2d
1303	ld1	{v25.2d},[x3],#16
1304	ext	v24.16b,v24.16b,v24.16b,#8
1305	ext	v5.16b,v3.16b,v0.16b,#8
1306	ext	v6.16b,v4.16b,v3.16b,#8
1307	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1308.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1309	ext	v7.16b,v18.16b,v19.16b,#8
1310.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1311.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1312	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1313.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1314	add	v25.2d,v25.2d,v23.2d
1315	ld1	{v24.2d},[x3],#16
1316	ext	v25.16b,v25.16b,v25.16b,#8
1317	ext	v5.16b,v2.16b,v3.16b,#8
1318	ext	v6.16b,v1.16b,v2.16b,#8
1319	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1320.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1321	ext	v7.16b,v19.16b,v20.16b,#8
1322.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1323.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1324	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1325.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
// ---- groups 17-24 ----
1326	add	v24.2d,v24.2d,v16.2d
1327	ld1	{v25.2d},[x3],#16
1328	ext	v24.16b,v24.16b,v24.16b,#8
1329	ext	v5.16b,v4.16b,v2.16b,#8
1330	ext	v6.16b,v0.16b,v4.16b,#8
1331	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1332.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1333	ext	v7.16b,v20.16b,v21.16b,#8
1334.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1335.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1336	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1337.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1338	add	v25.2d,v25.2d,v17.2d
1339	ld1	{v24.2d},[x3],#16
1340	ext	v25.16b,v25.16b,v25.16b,#8
1341	ext	v5.16b,v1.16b,v4.16b,#8
1342	ext	v6.16b,v3.16b,v1.16b,#8
1343	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1344.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1345	ext	v7.16b,v21.16b,v22.16b,#8
1346.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1347.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1348	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1349.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1350	add	v24.2d,v24.2d,v18.2d
1351	ld1	{v25.2d},[x3],#16
1352	ext	v24.16b,v24.16b,v24.16b,#8
1353	ext	v5.16b,v0.16b,v1.16b,#8
1354	ext	v6.16b,v2.16b,v0.16b,#8
1355	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1356.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1357	ext	v7.16b,v22.16b,v23.16b,#8
1358.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1359.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1360	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1361.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1362	add	v25.2d,v25.2d,v19.2d
1363	ld1	{v24.2d},[x3],#16
1364	ext	v25.16b,v25.16b,v25.16b,#8
1365	ext	v5.16b,v3.16b,v0.16b,#8
1366	ext	v6.16b,v4.16b,v3.16b,#8
1367	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1368.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1369	ext	v7.16b,v23.16b,v16.16b,#8
1370.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1371.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1372	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1373.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1374	add	v24.2d,v24.2d,v20.2d
1375	ld1	{v25.2d},[x3],#16
1376	ext	v24.16b,v24.16b,v24.16b,#8
1377	ext	v5.16b,v2.16b,v3.16b,#8
1378	ext	v6.16b,v1.16b,v2.16b,#8
1379	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1380.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1381	ext	v7.16b,v16.16b,v17.16b,#8
1382.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1383.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1384	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1385.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1386	add	v25.2d,v25.2d,v21.2d
1387	ld1	{v24.2d},[x3],#16
1388	ext	v25.16b,v25.16b,v25.16b,#8
1389	ext	v5.16b,v4.16b,v2.16b,#8
1390	ext	v6.16b,v0.16b,v4.16b,#8
1391	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1392.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1393	ext	v7.16b,v17.16b,v18.16b,#8
1394.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1395.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1396	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1397.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1398	add	v24.2d,v24.2d,v22.2d
1399	ld1	{v25.2d},[x3],#16
1400	ext	v24.16b,v24.16b,v24.16b,#8
1401	ext	v5.16b,v1.16b,v4.16b,#8
1402	ext	v6.16b,v3.16b,v1.16b,#8
1403	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1404.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1405	ext	v7.16b,v18.16b,v19.16b,#8
1406.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1407.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1408	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1409.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1410	add	v25.2d,v25.2d,v23.2d
1411	ld1	{v24.2d},[x3],#16
1412	ext	v25.16b,v25.16b,v25.16b,#8
1413	ext	v5.16b,v0.16b,v1.16b,#8
1414	ext	v6.16b,v2.16b,v0.16b,#8
1415	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1416.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1417	ext	v7.16b,v19.16b,v20.16b,#8
1418.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1419.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1420	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1421.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
// ---- groups 25-32 (last groups that update the message schedule) ----
1422	add	v24.2d,v24.2d,v16.2d
1423	ld1	{v25.2d},[x3],#16
1424	ext	v24.16b,v24.16b,v24.16b,#8
1425	ext	v5.16b,v3.16b,v0.16b,#8
1426	ext	v6.16b,v4.16b,v3.16b,#8
1427	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1428.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1429	ext	v7.16b,v20.16b,v21.16b,#8
1430.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1431.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1432	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1433.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1434	add	v25.2d,v25.2d,v17.2d
1435	ld1	{v24.2d},[x3],#16
1436	ext	v25.16b,v25.16b,v25.16b,#8
1437	ext	v5.16b,v2.16b,v3.16b,#8
1438	ext	v6.16b,v1.16b,v2.16b,#8
1439	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1440.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1441	ext	v7.16b,v21.16b,v22.16b,#8
1442.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1443.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1444	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1445.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1446	add	v24.2d,v24.2d,v18.2d
1447	ld1	{v25.2d},[x3],#16
1448	ext	v24.16b,v24.16b,v24.16b,#8
1449	ext	v5.16b,v4.16b,v2.16b,#8
1450	ext	v6.16b,v0.16b,v4.16b,#8
1451	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1452.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1453	ext	v7.16b,v22.16b,v23.16b,#8
1454.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1455.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1456	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1457.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1458	add	v25.2d,v25.2d,v19.2d
1459	ld1	{v24.2d},[x3],#16
1460	ext	v25.16b,v25.16b,v25.16b,#8
1461	ext	v5.16b,v1.16b,v4.16b,#8
1462	ext	v6.16b,v3.16b,v1.16b,#8
1463	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1464.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1465	ext	v7.16b,v23.16b,v16.16b,#8
1466.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1467.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1468	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1469.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1470	add	v24.2d,v24.2d,v20.2d
1471	ld1	{v25.2d},[x3],#16
1472	ext	v24.16b,v24.16b,v24.16b,#8
1473	ext	v5.16b,v0.16b,v1.16b,#8
1474	ext	v6.16b,v2.16b,v0.16b,#8
1475	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1476.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1477	ext	v7.16b,v16.16b,v17.16b,#8
1478.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1479.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1480	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1481.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1482	add	v25.2d,v25.2d,v21.2d
1483	ld1	{v24.2d},[x3],#16
1484	ext	v25.16b,v25.16b,v25.16b,#8
1485	ext	v5.16b,v3.16b,v0.16b,#8
1486	ext	v6.16b,v4.16b,v3.16b,#8
1487	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1488.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1489	ext	v7.16b,v17.16b,v18.16b,#8
1490.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1491.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1492	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1493.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1494	add	v24.2d,v24.2d,v22.2d
1495	ld1	{v25.2d},[x3],#16
1496	ext	v24.16b,v24.16b,v24.16b,#8
1497	ext	v5.16b,v2.16b,v3.16b,#8
1498	ext	v6.16b,v1.16b,v2.16b,#8
1499	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1500.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1501	ext	v7.16b,v18.16b,v19.16b,#8
1502.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1503.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1504	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1505.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1506	add	v25.2d,v25.2d,v23.2d
1507	ld1	{v24.2d},[x3],#16
1508	ext	v25.16b,v25.16b,v25.16b,#8
1509	ext	v5.16b,v4.16b,v2.16b,#8
1510	ext	v6.16b,v0.16b,v4.16b,#8
1511	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1512.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1513	ext	v7.16b,v19.16b,v20.16b,#8
1514.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1515.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1516	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1517.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
// ---- groups 33-40: no further schedule update.  Each message register is
// consumed (added to its K pair) and then immediately refilled with the next
// 16 input bytes from x1 and byte-reversed, ready for the following pass.
// On the last block x1 was rewound above, so these loads re-read the block
// being processed.
1518	ld1	{v25.2d},[x3],#16
1519	add	v24.2d,v24.2d,v16.2d
1520	ld1	{v16.16b},[x1],#16		// load next input
1521	ext	v24.16b,v24.16b,v24.16b,#8
1522	ext	v5.16b,v1.16b,v4.16b,#8
1523	ext	v6.16b,v3.16b,v1.16b,#8
1524	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1525.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1526	rev64	v16.16b,v16.16b
1527	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1528.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1529	ld1	{v24.2d},[x3],#16
1530	add	v25.2d,v25.2d,v17.2d
1531	ld1	{v17.16b},[x1],#16		// load next input
1532	ext	v25.16b,v25.16b,v25.16b,#8
1533	ext	v5.16b,v0.16b,v1.16b,#8
1534	ext	v6.16b,v2.16b,v0.16b,#8
1535	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1536.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1537	rev64	v17.16b,v17.16b
1538	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1539.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1540	ld1	{v25.2d},[x3],#16
1541	add	v24.2d,v24.2d,v18.2d
1542	ld1	{v18.16b},[x1],#16		// load next input
1543	ext	v24.16b,v24.16b,v24.16b,#8
1544	ext	v5.16b,v3.16b,v0.16b,#8
1545	ext	v6.16b,v4.16b,v3.16b,#8
1546	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1547.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1548	rev64	v18.16b,v18.16b
1549	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1550.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1551	ld1	{v24.2d},[x3],#16
1552	add	v25.2d,v25.2d,v19.2d
1553	ld1	{v19.16b},[x1],#16		// load next input
1554	ext	v25.16b,v25.16b,v25.16b,#8
1555	ext	v5.16b,v2.16b,v3.16b,#8
1556	ext	v6.16b,v1.16b,v2.16b,#8
1557	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1558.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1559	rev64	v19.16b,v19.16b
1560	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1561.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1562	ld1	{v25.2d},[x3],#16
1563	add	v24.2d,v24.2d,v20.2d
1564	ld1	{v20.16b},[x1],#16		// load next input
1565	ext	v24.16b,v24.16b,v24.16b,#8
1566	ext	v5.16b,v4.16b,v2.16b,#8
1567	ext	v6.16b,v0.16b,v4.16b,#8
1568	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1569.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1570	rev64	v20.16b,v20.16b
1571	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1572.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1573	ld1	{v24.2d},[x3],#16
1574	add	v25.2d,v25.2d,v21.2d
1575	ld1	{v21.16b},[x1],#16		// load next input
1576	ext	v25.16b,v25.16b,v25.16b,#8
1577	ext	v5.16b,v1.16b,v4.16b,#8
1578	ext	v6.16b,v3.16b,v1.16b,#8
1579	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1580.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1581	rev64	v21.16b,v21.16b
1582	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1583.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1584	ld1	{v25.2d},[x3],#16
1585	add	v24.2d,v24.2d,v22.2d
1586	ld1	{v22.16b},[x1],#16		// load next input
1587	ext	v24.16b,v24.16b,v24.16b,#8
1588	ext	v5.16b,v0.16b,v1.16b,#8
1589	ext	v6.16b,v2.16b,v0.16b,#8
1590	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1591.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1592	rev64	v22.16b,v22.16b
1593	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1594.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
// All 40 K512 pairs (80*8 bytes) have been loaded; point x3 back at .LK512.
1595	sub	x3,x3,#80*8	// rewind
1596	add	v25.2d,v25.2d,v23.2d
1597	ld1	{v23.16b},[x1],#16		// load next input
1598	ext	v25.16b,v25.16b,v25.16b,#8
1599	ext	v5.16b,v3.16b,v0.16b,#8
1600	ext	v6.16b,v4.16b,v3.16b,#8
1601	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1602.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1603	rev64	v23.16b,v23.16b
1604	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1605.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
// The working state is back in v0-v3; add the copy saved in v26-v29 at the
// top of the pass.
1606	add	v0.2d,v0.2d,v26.2d			// accumulate
1607	add	v1.2d,v1.2d,v27.2d
1608	add	v2.2d,v2.2d,v28.2d
1609	add	v3.2d,v3.2d,v29.2d
1610
// x2 was decremented by the subs at the top of the pass; loop while blocks
// remain.
1611	cbnz	x2,.Loop_hw
1612
1613	st1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// store context
1614
// Restore x29 and pop the 16-byte frame.  x30 is not reloaded: nothing in
// this function writes it after the stp at entry.
1615	ldr	x29,[sp],#16
1616	ret
1617.size	sha512_block_armv8,.-sha512_block_armv8
1618#endif
1619