• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#if !defined(__has_feature)
5#define __has_feature(x) 0
6#endif
7#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
8#define OPENSSL_NO_ASM
9#endif
10
11#if !defined(OPENSSL_NO_ASM)
12#if defined(__aarch64__)
13#if defined(BORINGSSL_PREFIX)
14#include <boringssl_prefix_symbols_asm.h>
15#endif
16// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
17//
18// Licensed under the OpenSSL license (the "License").  You may not use
19// this file except in compliance with the License.  You can obtain a copy
20// in the file LICENSE in the source distribution or at
21// https://www.openssl.org/source/license.html
22
23// ====================================================================
24// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
25// project. The module is, however, dual licensed under OpenSSL and
26// CRYPTOGAMS licenses depending on where you obtain it. For further
27// details see http://www.openssl.org/~appro/cryptogams/.
28//
29// Permission to use under GPLv2 terms is granted.
30// ====================================================================
31//
32// SHA256/512 for ARMv8.
33//
34// Performance in cycles per processed byte and improvement coefficient
35// over code generated with "default" compiler:
36//
37//		SHA256-hw	SHA256(*)	SHA512
38// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
39// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
40// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
41// Denver	2.01		10.5 (+26%)	6.70 (+8%)
42// X-Gene			20.0 (+100%)	12.8 (+300%(***))
43// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
44// Kryo		1.92		17.4 (+30%)	11.2 (+8%)
45//
46// (*)	Software SHA256 results are of lesser relevance, presented
47//	mostly for informational purposes.
48// (**)	The result is a trade-off: it's possible to improve it by
49//	10% (or by 1 cycle per round), but at the cost of 20% loss
50//	on Cortex-A53 (or by 4 cycles per round).
51// (***)	Super-impressive coefficients over gcc-generated code are
52//	an indication of some compiler "pathology"; most notably, code
53//	generated with -mgeneral-regs-only is significantly faster
54//	and the gap is only 40-90%.
55
56#ifndef	__KERNEL__
57# include <openssl/arm_arch.h>
58#endif
59
60.text
61
62
63
64.globl	sha512_block_data_order
65
66.def sha512_block_data_order
67   .type 32
68.endef
69.align	6
70sha512_block_data_order:
71	AARCH64_VALID_CALL_TARGET
72#ifndef	__KERNEL__
73#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
74	adrp	x16,:pg_hi21_nc:OPENSSL_armcap_P
75#else
76	adrp	x16,OPENSSL_armcap_P
77#endif
78	ldr	w16,[x16,:lo12:OPENSSL_armcap_P]
79	tst	w16,#ARMV8_SHA512
80	b.ne	Lv8_entry
81#endif
82	AARCH64_SIGN_LINK_REGISTER
83	stp	x29,x30,[sp,#-128]!
84	add	x29,sp,#0
85
86	stp	x19,x20,[sp,#16]
87	stp	x21,x22,[sp,#32]
88	stp	x23,x24,[sp,#48]
89	stp	x25,x26,[sp,#64]
90	stp	x27,x28,[sp,#80]
91	sub	sp,sp,#4*8
92
93	ldp	x20,x21,[x0]				// load context
94	ldp	x22,x23,[x0,#2*8]
95	ldp	x24,x25,[x0,#4*8]
96	add	x2,x1,x2,lsl#7	// end of input
97	ldp	x26,x27,[x0,#6*8]
98	adrp	x30,LK512
99	add	x30,x30,:lo12:LK512
100	stp	x0,x2,[x29,#96]
101
102Loop:
103	ldp	x3,x4,[x1],#2*8
104	ldr	x19,[x30],#8			// *K++
105	eor	x28,x21,x22				// magic seed
106	str	x1,[x29,#112]
107#ifndef	__AARCH64EB__
108	rev	x3,x3			// 0
109#endif
110	ror	x16,x24,#14
111	add	x27,x27,x19			// h+=K[i]
112	eor	x6,x24,x24,ror#23
113	and	x17,x25,x24
114	bic	x19,x26,x24
115	add	x27,x27,x3			// h+=X[i]
116	orr	x17,x17,x19			// Ch(e,f,g)
117	eor	x19,x20,x21			// a^b, b^c in next round
118	eor	x16,x16,x6,ror#18	// Sigma1(e)
119	ror	x6,x20,#28
120	add	x27,x27,x17			// h+=Ch(e,f,g)
121	eor	x17,x20,x20,ror#5
122	add	x27,x27,x16			// h+=Sigma1(e)
123	and	x28,x28,x19			// (b^c)&=(a^b)
124	add	x23,x23,x27			// d+=h
125	eor	x28,x28,x21			// Maj(a,b,c)
126	eor	x17,x6,x17,ror#34	// Sigma0(a)
127	add	x27,x27,x28			// h+=Maj(a,b,c)
128	ldr	x28,[x30],#8		// *K++, x19 in next round
129	//add	x27,x27,x17			// h+=Sigma0(a)
130#ifndef	__AARCH64EB__
131	rev	x4,x4			// 1
132#endif
133	ldp	x5,x6,[x1],#2*8
134	add	x27,x27,x17			// h+=Sigma0(a)
135	ror	x16,x23,#14
136	add	x26,x26,x28			// h+=K[i]
137	eor	x7,x23,x23,ror#23
138	and	x17,x24,x23
139	bic	x28,x25,x23
140	add	x26,x26,x4			// h+=X[i]
141	orr	x17,x17,x28			// Ch(e,f,g)
142	eor	x28,x27,x20			// a^b, b^c in next round
143	eor	x16,x16,x7,ror#18	// Sigma1(e)
144	ror	x7,x27,#28
145	add	x26,x26,x17			// h+=Ch(e,f,g)
146	eor	x17,x27,x27,ror#5
147	add	x26,x26,x16			// h+=Sigma1(e)
148	and	x19,x19,x28			// (b^c)&=(a^b)
149	add	x22,x22,x26			// d+=h
150	eor	x19,x19,x20			// Maj(a,b,c)
151	eor	x17,x7,x17,ror#34	// Sigma0(a)
152	add	x26,x26,x19			// h+=Maj(a,b,c)
153	ldr	x19,[x30],#8		// *K++, x28 in next round
154	//add	x26,x26,x17			// h+=Sigma0(a)
155#ifndef	__AARCH64EB__
156	rev	x5,x5			// 2
157#endif
158	add	x26,x26,x17			// h+=Sigma0(a)
159	ror	x16,x22,#14
160	add	x25,x25,x19			// h+=K[i]
161	eor	x8,x22,x22,ror#23
162	and	x17,x23,x22
163	bic	x19,x24,x22
164	add	x25,x25,x5			// h+=X[i]
165	orr	x17,x17,x19			// Ch(e,f,g)
166	eor	x19,x26,x27			// a^b, b^c in next round
167	eor	x16,x16,x8,ror#18	// Sigma1(e)
168	ror	x8,x26,#28
169	add	x25,x25,x17			// h+=Ch(e,f,g)
170	eor	x17,x26,x26,ror#5
171	add	x25,x25,x16			// h+=Sigma1(e)
172	and	x28,x28,x19			// (b^c)&=(a^b)
173	add	x21,x21,x25			// d+=h
174	eor	x28,x28,x27			// Maj(a,b,c)
175	eor	x17,x8,x17,ror#34	// Sigma0(a)
176	add	x25,x25,x28			// h+=Maj(a,b,c)
177	ldr	x28,[x30],#8		// *K++, x19 in next round
178	//add	x25,x25,x17			// h+=Sigma0(a)
179#ifndef	__AARCH64EB__
180	rev	x6,x6			// 3
181#endif
182	ldp	x7,x8,[x1],#2*8
183	add	x25,x25,x17			// h+=Sigma0(a)
184	ror	x16,x21,#14
185	add	x24,x24,x28			// h+=K[i]
186	eor	x9,x21,x21,ror#23
187	and	x17,x22,x21
188	bic	x28,x23,x21
189	add	x24,x24,x6			// h+=X[i]
190	orr	x17,x17,x28			// Ch(e,f,g)
191	eor	x28,x25,x26			// a^b, b^c in next round
192	eor	x16,x16,x9,ror#18	// Sigma1(e)
193	ror	x9,x25,#28
194	add	x24,x24,x17			// h+=Ch(e,f,g)
195	eor	x17,x25,x25,ror#5
196	add	x24,x24,x16			// h+=Sigma1(e)
197	and	x19,x19,x28			// (b^c)&=(a^b)
198	add	x20,x20,x24			// d+=h
199	eor	x19,x19,x26			// Maj(a,b,c)
200	eor	x17,x9,x17,ror#34	// Sigma0(a)
201	add	x24,x24,x19			// h+=Maj(a,b,c)
202	ldr	x19,[x30],#8		// *K++, x28 in next round
203	//add	x24,x24,x17			// h+=Sigma0(a)
204#ifndef	__AARCH64EB__
205	rev	x7,x7			// 4
206#endif
207	add	x24,x24,x17			// h+=Sigma0(a)
208	ror	x16,x20,#14
209	add	x23,x23,x19			// h+=K[i]
210	eor	x10,x20,x20,ror#23
211	and	x17,x21,x20
212	bic	x19,x22,x20
213	add	x23,x23,x7			// h+=X[i]
214	orr	x17,x17,x19			// Ch(e,f,g)
215	eor	x19,x24,x25			// a^b, b^c in next round
216	eor	x16,x16,x10,ror#18	// Sigma1(e)
217	ror	x10,x24,#28
218	add	x23,x23,x17			// h+=Ch(e,f,g)
219	eor	x17,x24,x24,ror#5
220	add	x23,x23,x16			// h+=Sigma1(e)
221	and	x28,x28,x19			// (b^c)&=(a^b)
222	add	x27,x27,x23			// d+=h
223	eor	x28,x28,x25			// Maj(a,b,c)
224	eor	x17,x10,x17,ror#34	// Sigma0(a)
225	add	x23,x23,x28			// h+=Maj(a,b,c)
226	ldr	x28,[x30],#8		// *K++, x19 in next round
227	//add	x23,x23,x17			// h+=Sigma0(a)
228#ifndef	__AARCH64EB__
229	rev	x8,x8			// 5
230#endif
231	ldp	x9,x10,[x1],#2*8
232	add	x23,x23,x17			// h+=Sigma0(a)
233	ror	x16,x27,#14
234	add	x22,x22,x28			// h+=K[i]
235	eor	x11,x27,x27,ror#23
236	and	x17,x20,x27
237	bic	x28,x21,x27
238	add	x22,x22,x8			// h+=X[i]
239	orr	x17,x17,x28			// Ch(e,f,g)
240	eor	x28,x23,x24			// a^b, b^c in next round
241	eor	x16,x16,x11,ror#18	// Sigma1(e)
242	ror	x11,x23,#28
243	add	x22,x22,x17			// h+=Ch(e,f,g)
244	eor	x17,x23,x23,ror#5
245	add	x22,x22,x16			// h+=Sigma1(e)
246	and	x19,x19,x28			// (b^c)&=(a^b)
247	add	x26,x26,x22			// d+=h
248	eor	x19,x19,x24			// Maj(a,b,c)
249	eor	x17,x11,x17,ror#34	// Sigma0(a)
250	add	x22,x22,x19			// h+=Maj(a,b,c)
251	ldr	x19,[x30],#8		// *K++, x28 in next round
252	//add	x22,x22,x17			// h+=Sigma0(a)
253#ifndef	__AARCH64EB__
254	rev	x9,x9			// 6
255#endif
256	add	x22,x22,x17			// h+=Sigma0(a)
257	ror	x16,x26,#14
258	add	x21,x21,x19			// h+=K[i]
259	eor	x12,x26,x26,ror#23
260	and	x17,x27,x26
261	bic	x19,x20,x26
262	add	x21,x21,x9			// h+=X[i]
263	orr	x17,x17,x19			// Ch(e,f,g)
264	eor	x19,x22,x23			// a^b, b^c in next round
265	eor	x16,x16,x12,ror#18	// Sigma1(e)
266	ror	x12,x22,#28
267	add	x21,x21,x17			// h+=Ch(e,f,g)
268	eor	x17,x22,x22,ror#5
269	add	x21,x21,x16			// h+=Sigma1(e)
270	and	x28,x28,x19			// (b^c)&=(a^b)
271	add	x25,x25,x21			// d+=h
272	eor	x28,x28,x23			// Maj(a,b,c)
273	eor	x17,x12,x17,ror#34	// Sigma0(a)
274	add	x21,x21,x28			// h+=Maj(a,b,c)
275	ldr	x28,[x30],#8		// *K++, x19 in next round
276	//add	x21,x21,x17			// h+=Sigma0(a)
277#ifndef	__AARCH64EB__
278	rev	x10,x10			// 7
279#endif
280	ldp	x11,x12,[x1],#2*8
281	add	x21,x21,x17			// h+=Sigma0(a)
282	ror	x16,x25,#14
283	add	x20,x20,x28			// h+=K[i]
284	eor	x13,x25,x25,ror#23
285	and	x17,x26,x25
286	bic	x28,x27,x25
287	add	x20,x20,x10			// h+=X[i]
288	orr	x17,x17,x28			// Ch(e,f,g)
289	eor	x28,x21,x22			// a^b, b^c in next round
290	eor	x16,x16,x13,ror#18	// Sigma1(e)
291	ror	x13,x21,#28
292	add	x20,x20,x17			// h+=Ch(e,f,g)
293	eor	x17,x21,x21,ror#5
294	add	x20,x20,x16			// h+=Sigma1(e)
295	and	x19,x19,x28			// (b^c)&=(a^b)
296	add	x24,x24,x20			// d+=h
297	eor	x19,x19,x22			// Maj(a,b,c)
298	eor	x17,x13,x17,ror#34	// Sigma0(a)
299	add	x20,x20,x19			// h+=Maj(a,b,c)
300	ldr	x19,[x30],#8		// *K++, x28 in next round
301	//add	x20,x20,x17			// h+=Sigma0(a)
302#ifndef	__AARCH64EB__
303	rev	x11,x11			// 8
304#endif
305	add	x20,x20,x17			// h+=Sigma0(a)
306	ror	x16,x24,#14
307	add	x27,x27,x19			// h+=K[i]
308	eor	x14,x24,x24,ror#23
309	and	x17,x25,x24
310	bic	x19,x26,x24
311	add	x27,x27,x11			// h+=X[i]
312	orr	x17,x17,x19			// Ch(e,f,g)
313	eor	x19,x20,x21			// a^b, b^c in next round
314	eor	x16,x16,x14,ror#18	// Sigma1(e)
315	ror	x14,x20,#28
316	add	x27,x27,x17			// h+=Ch(e,f,g)
317	eor	x17,x20,x20,ror#5
318	add	x27,x27,x16			// h+=Sigma1(e)
319	and	x28,x28,x19			// (b^c)&=(a^b)
320	add	x23,x23,x27			// d+=h
321	eor	x28,x28,x21			// Maj(a,b,c)
322	eor	x17,x14,x17,ror#34	// Sigma0(a)
323	add	x27,x27,x28			// h+=Maj(a,b,c)
324	ldr	x28,[x30],#8		// *K++, x19 in next round
325	//add	x27,x27,x17			// h+=Sigma0(a)
326#ifndef	__AARCH64EB__
327	rev	x12,x12			// 9
328#endif
329	ldp	x13,x14,[x1],#2*8
330	add	x27,x27,x17			// h+=Sigma0(a)
331	ror	x16,x23,#14
332	add	x26,x26,x28			// h+=K[i]
333	eor	x15,x23,x23,ror#23
334	and	x17,x24,x23
335	bic	x28,x25,x23
336	add	x26,x26,x12			// h+=X[i]
337	orr	x17,x17,x28			// Ch(e,f,g)
338	eor	x28,x27,x20			// a^b, b^c in next round
339	eor	x16,x16,x15,ror#18	// Sigma1(e)
340	ror	x15,x27,#28
341	add	x26,x26,x17			// h+=Ch(e,f,g)
342	eor	x17,x27,x27,ror#5
343	add	x26,x26,x16			// h+=Sigma1(e)
344	and	x19,x19,x28			// (b^c)&=(a^b)
345	add	x22,x22,x26			// d+=h
346	eor	x19,x19,x20			// Maj(a,b,c)
347	eor	x17,x15,x17,ror#34	// Sigma0(a)
348	add	x26,x26,x19			// h+=Maj(a,b,c)
349	ldr	x19,[x30],#8		// *K++, x28 in next round
350	//add	x26,x26,x17			// h+=Sigma0(a)
351#ifndef	__AARCH64EB__
352	rev	x13,x13			// 10
353#endif
354	add	x26,x26,x17			// h+=Sigma0(a)
355	ror	x16,x22,#14
356	add	x25,x25,x19			// h+=K[i]
357	eor	x0,x22,x22,ror#23
358	and	x17,x23,x22
359	bic	x19,x24,x22
360	add	x25,x25,x13			// h+=X[i]
361	orr	x17,x17,x19			// Ch(e,f,g)
362	eor	x19,x26,x27			// a^b, b^c in next round
363	eor	x16,x16,x0,ror#18	// Sigma1(e)
364	ror	x0,x26,#28
365	add	x25,x25,x17			// h+=Ch(e,f,g)
366	eor	x17,x26,x26,ror#5
367	add	x25,x25,x16			// h+=Sigma1(e)
368	and	x28,x28,x19			// (b^c)&=(a^b)
369	add	x21,x21,x25			// d+=h
370	eor	x28,x28,x27			// Maj(a,b,c)
371	eor	x17,x0,x17,ror#34	// Sigma0(a)
372	add	x25,x25,x28			// h+=Maj(a,b,c)
373	ldr	x28,[x30],#8		// *K++, x19 in next round
374	//add	x25,x25,x17			// h+=Sigma0(a)
375#ifndef	__AARCH64EB__
376	rev	x14,x14			// 11
377#endif
378	ldp	x15,x0,[x1],#2*8
379	add	x25,x25,x17			// h+=Sigma0(a)
380	str	x6,[sp,#24]
381	ror	x16,x21,#14
382	add	x24,x24,x28			// h+=K[i]
383	eor	x6,x21,x21,ror#23
384	and	x17,x22,x21
385	bic	x28,x23,x21
386	add	x24,x24,x14			// h+=X[i]
387	orr	x17,x17,x28			// Ch(e,f,g)
388	eor	x28,x25,x26			// a^b, b^c in next round
389	eor	x16,x16,x6,ror#18	// Sigma1(e)
390	ror	x6,x25,#28
391	add	x24,x24,x17			// h+=Ch(e,f,g)
392	eor	x17,x25,x25,ror#5
393	add	x24,x24,x16			// h+=Sigma1(e)
394	and	x19,x19,x28			// (b^c)&=(a^b)
395	add	x20,x20,x24			// d+=h
396	eor	x19,x19,x26			// Maj(a,b,c)
397	eor	x17,x6,x17,ror#34	// Sigma0(a)
398	add	x24,x24,x19			// h+=Maj(a,b,c)
399	ldr	x19,[x30],#8		// *K++, x28 in next round
400	//add	x24,x24,x17			// h+=Sigma0(a)
401#ifndef	__AARCH64EB__
402	rev	x15,x15			// 12
403#endif
404	add	x24,x24,x17			// h+=Sigma0(a)
405	str	x7,[sp,#0]
406	ror	x16,x20,#14
407	add	x23,x23,x19			// h+=K[i]
408	eor	x7,x20,x20,ror#23
409	and	x17,x21,x20
410	bic	x19,x22,x20
411	add	x23,x23,x15			// h+=X[i]
412	orr	x17,x17,x19			// Ch(e,f,g)
413	eor	x19,x24,x25			// a^b, b^c in next round
414	eor	x16,x16,x7,ror#18	// Sigma1(e)
415	ror	x7,x24,#28
416	add	x23,x23,x17			// h+=Ch(e,f,g)
417	eor	x17,x24,x24,ror#5
418	add	x23,x23,x16			// h+=Sigma1(e)
419	and	x28,x28,x19			// (b^c)&=(a^b)
420	add	x27,x27,x23			// d+=h
421	eor	x28,x28,x25			// Maj(a,b,c)
422	eor	x17,x7,x17,ror#34	// Sigma0(a)
423	add	x23,x23,x28			// h+=Maj(a,b,c)
424	ldr	x28,[x30],#8		// *K++, x19 in next round
425	//add	x23,x23,x17			// h+=Sigma0(a)
426#ifndef	__AARCH64EB__
427	rev	x0,x0			// 13
428#endif
429	ldp	x1,x2,[x1]
430	add	x23,x23,x17			// h+=Sigma0(a)
431	str	x8,[sp,#8]
432	ror	x16,x27,#14
433	add	x22,x22,x28			// h+=K[i]
434	eor	x8,x27,x27,ror#23
435	and	x17,x20,x27
436	bic	x28,x21,x27
437	add	x22,x22,x0			// h+=X[i]
438	orr	x17,x17,x28			// Ch(e,f,g)
439	eor	x28,x23,x24			// a^b, b^c in next round
440	eor	x16,x16,x8,ror#18	// Sigma1(e)
441	ror	x8,x23,#28
442	add	x22,x22,x17			// h+=Ch(e,f,g)
443	eor	x17,x23,x23,ror#5
444	add	x22,x22,x16			// h+=Sigma1(e)
445	and	x19,x19,x28			// (b^c)&=(a^b)
446	add	x26,x26,x22			// d+=h
447	eor	x19,x19,x24			// Maj(a,b,c)
448	eor	x17,x8,x17,ror#34	// Sigma0(a)
449	add	x22,x22,x19			// h+=Maj(a,b,c)
450	ldr	x19,[x30],#8		// *K++, x28 in next round
451	//add	x22,x22,x17			// h+=Sigma0(a)
452#ifndef	__AARCH64EB__
453	rev	x1,x1			// 14
454#endif
455	ldr	x6,[sp,#24]
456	add	x22,x22,x17			// h+=Sigma0(a)
457	str	x9,[sp,#16]
458	ror	x16,x26,#14
459	add	x21,x21,x19			// h+=K[i]
460	eor	x9,x26,x26,ror#23
461	and	x17,x27,x26
462	bic	x19,x20,x26
463	add	x21,x21,x1			// h+=X[i]
464	orr	x17,x17,x19			// Ch(e,f,g)
465	eor	x19,x22,x23			// a^b, b^c in next round
466	eor	x16,x16,x9,ror#18	// Sigma1(e)
467	ror	x9,x22,#28
468	add	x21,x21,x17			// h+=Ch(e,f,g)
469	eor	x17,x22,x22,ror#5
470	add	x21,x21,x16			// h+=Sigma1(e)
471	and	x28,x28,x19			// (b^c)&=(a^b)
472	add	x25,x25,x21			// d+=h
473	eor	x28,x28,x23			// Maj(a,b,c)
474	eor	x17,x9,x17,ror#34	// Sigma0(a)
475	add	x21,x21,x28			// h+=Maj(a,b,c)
476	ldr	x28,[x30],#8		// *K++, x19 in next round
477	//add	x21,x21,x17			// h+=Sigma0(a)
478#ifndef	__AARCH64EB__
479	rev	x2,x2			// 15
480#endif
481	ldr	x7,[sp,#0]
482	add	x21,x21,x17			// h+=Sigma0(a)
483	str	x10,[sp,#24]
484	ror	x16,x25,#14
485	add	x20,x20,x28			// h+=K[i]
486	ror	x9,x4,#1
487	and	x17,x26,x25
488	ror	x8,x1,#19
489	bic	x28,x27,x25
490	ror	x10,x21,#28
491	add	x20,x20,x2			// h+=X[i]
492	eor	x16,x16,x25,ror#18
493	eor	x9,x9,x4,ror#8
494	orr	x17,x17,x28			// Ch(e,f,g)
495	eor	x28,x21,x22			// a^b, b^c in next round
496	eor	x16,x16,x25,ror#41	// Sigma1(e)
497	eor	x10,x10,x21,ror#34
498	add	x20,x20,x17			// h+=Ch(e,f,g)
499	and	x19,x19,x28			// (b^c)&=(a^b)
500	eor	x8,x8,x1,ror#61
501	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
502	add	x20,x20,x16			// h+=Sigma1(e)
503	eor	x19,x19,x22			// Maj(a,b,c)
504	eor	x17,x10,x21,ror#39	// Sigma0(a)
505	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
506	add	x3,x3,x12
507	add	x24,x24,x20			// d+=h
508	add	x20,x20,x19			// h+=Maj(a,b,c)
509	ldr	x19,[x30],#8		// *K++, x28 in next round
510	add	x3,x3,x9
511	add	x20,x20,x17			// h+=Sigma0(a)
512	add	x3,x3,x8
513Loop_16_xx:
514	ldr	x8,[sp,#8]
515	str	x11,[sp,#0]
516	ror	x16,x24,#14
517	add	x27,x27,x19			// h+=K[i]
518	ror	x10,x5,#1
519	and	x17,x25,x24
520	ror	x9,x2,#19
521	bic	x19,x26,x24
522	ror	x11,x20,#28
523	add	x27,x27,x3			// h+=X[i]
524	eor	x16,x16,x24,ror#18
525	eor	x10,x10,x5,ror#8
526	orr	x17,x17,x19			// Ch(e,f,g)
527	eor	x19,x20,x21			// a^b, b^c in next round
528	eor	x16,x16,x24,ror#41	// Sigma1(e)
529	eor	x11,x11,x20,ror#34
530	add	x27,x27,x17			// h+=Ch(e,f,g)
531	and	x28,x28,x19			// (b^c)&=(a^b)
532	eor	x9,x9,x2,ror#61
533	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
534	add	x27,x27,x16			// h+=Sigma1(e)
535	eor	x28,x28,x21			// Maj(a,b,c)
536	eor	x17,x11,x20,ror#39	// Sigma0(a)
537	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
538	add	x4,x4,x13
539	add	x23,x23,x27			// d+=h
540	add	x27,x27,x28			// h+=Maj(a,b,c)
541	ldr	x28,[x30],#8		// *K++, x19 in next round
542	add	x4,x4,x10
543	add	x27,x27,x17			// h+=Sigma0(a)
544	add	x4,x4,x9
545	ldr	x9,[sp,#16]
546	str	x12,[sp,#8]
547	ror	x16,x23,#14
548	add	x26,x26,x28			// h+=K[i]
549	ror	x11,x6,#1
550	and	x17,x24,x23
551	ror	x10,x3,#19
552	bic	x28,x25,x23
553	ror	x12,x27,#28
554	add	x26,x26,x4			// h+=X[i]
555	eor	x16,x16,x23,ror#18
556	eor	x11,x11,x6,ror#8
557	orr	x17,x17,x28			// Ch(e,f,g)
558	eor	x28,x27,x20			// a^b, b^c in next round
559	eor	x16,x16,x23,ror#41	// Sigma1(e)
560	eor	x12,x12,x27,ror#34
561	add	x26,x26,x17			// h+=Ch(e,f,g)
562	and	x19,x19,x28			// (b^c)&=(a^b)
563	eor	x10,x10,x3,ror#61
564	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
565	add	x26,x26,x16			// h+=Sigma1(e)
566	eor	x19,x19,x20			// Maj(a,b,c)
567	eor	x17,x12,x27,ror#39	// Sigma0(a)
568	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
569	add	x5,x5,x14
570	add	x22,x22,x26			// d+=h
571	add	x26,x26,x19			// h+=Maj(a,b,c)
572	ldr	x19,[x30],#8		// *K++, x28 in next round
573	add	x5,x5,x11
574	add	x26,x26,x17			// h+=Sigma0(a)
575	add	x5,x5,x10
576	ldr	x10,[sp,#24]
577	str	x13,[sp,#16]
578	ror	x16,x22,#14
579	add	x25,x25,x19			// h+=K[i]
580	ror	x12,x7,#1
581	and	x17,x23,x22
582	ror	x11,x4,#19
583	bic	x19,x24,x22
584	ror	x13,x26,#28
585	add	x25,x25,x5			// h+=X[i]
586	eor	x16,x16,x22,ror#18
587	eor	x12,x12,x7,ror#8
588	orr	x17,x17,x19			// Ch(e,f,g)
589	eor	x19,x26,x27			// a^b, b^c in next round
590	eor	x16,x16,x22,ror#41	// Sigma1(e)
591	eor	x13,x13,x26,ror#34
592	add	x25,x25,x17			// h+=Ch(e,f,g)
593	and	x28,x28,x19			// (b^c)&=(a^b)
594	eor	x11,x11,x4,ror#61
595	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
596	add	x25,x25,x16			// h+=Sigma1(e)
597	eor	x28,x28,x27			// Maj(a,b,c)
598	eor	x17,x13,x26,ror#39	// Sigma0(a)
599	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
600	add	x6,x6,x15
601	add	x21,x21,x25			// d+=h
602	add	x25,x25,x28			// h+=Maj(a,b,c)
603	ldr	x28,[x30],#8		// *K++, x19 in next round
604	add	x6,x6,x12
605	add	x25,x25,x17			// h+=Sigma0(a)
606	add	x6,x6,x11
607	ldr	x11,[sp,#0]
608	str	x14,[sp,#24]
609	ror	x16,x21,#14
610	add	x24,x24,x28			// h+=K[i]
611	ror	x13,x8,#1
612	and	x17,x22,x21
613	ror	x12,x5,#19
614	bic	x28,x23,x21
615	ror	x14,x25,#28
616	add	x24,x24,x6			// h+=X[i]
617	eor	x16,x16,x21,ror#18
618	eor	x13,x13,x8,ror#8
619	orr	x17,x17,x28			// Ch(e,f,g)
620	eor	x28,x25,x26			// a^b, b^c in next round
621	eor	x16,x16,x21,ror#41	// Sigma1(e)
622	eor	x14,x14,x25,ror#34
623	add	x24,x24,x17			// h+=Ch(e,f,g)
624	and	x19,x19,x28			// (b^c)&=(a^b)
625	eor	x12,x12,x5,ror#61
626	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
627	add	x24,x24,x16			// h+=Sigma1(e)
628	eor	x19,x19,x26			// Maj(a,b,c)
629	eor	x17,x14,x25,ror#39	// Sigma0(a)
630	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
631	add	x7,x7,x0
632	add	x20,x20,x24			// d+=h
633	add	x24,x24,x19			// h+=Maj(a,b,c)
634	ldr	x19,[x30],#8		// *K++, x28 in next round
635	add	x7,x7,x13
636	add	x24,x24,x17			// h+=Sigma0(a)
637	add	x7,x7,x12
638	ldr	x12,[sp,#8]
639	str	x15,[sp,#0]
640	ror	x16,x20,#14
641	add	x23,x23,x19			// h+=K[i]
642	ror	x14,x9,#1
643	and	x17,x21,x20
644	ror	x13,x6,#19
645	bic	x19,x22,x20
646	ror	x15,x24,#28
647	add	x23,x23,x7			// h+=X[i]
648	eor	x16,x16,x20,ror#18
649	eor	x14,x14,x9,ror#8
650	orr	x17,x17,x19			// Ch(e,f,g)
651	eor	x19,x24,x25			// a^b, b^c in next round
652	eor	x16,x16,x20,ror#41	// Sigma1(e)
653	eor	x15,x15,x24,ror#34
654	add	x23,x23,x17			// h+=Ch(e,f,g)
655	and	x28,x28,x19			// (b^c)&=(a^b)
656	eor	x13,x13,x6,ror#61
657	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
658	add	x23,x23,x16			// h+=Sigma1(e)
659	eor	x28,x28,x25			// Maj(a,b,c)
660	eor	x17,x15,x24,ror#39	// Sigma0(a)
661	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
662	add	x8,x8,x1
663	add	x27,x27,x23			// d+=h
664	add	x23,x23,x28			// h+=Maj(a,b,c)
665	ldr	x28,[x30],#8		// *K++, x19 in next round
666	add	x8,x8,x14
667	add	x23,x23,x17			// h+=Sigma0(a)
668	add	x8,x8,x13
669	ldr	x13,[sp,#16]
670	str	x0,[sp,#8]
671	ror	x16,x27,#14
672	add	x22,x22,x28			// h+=K[i]
673	ror	x15,x10,#1
674	and	x17,x20,x27
675	ror	x14,x7,#19
676	bic	x28,x21,x27
677	ror	x0,x23,#28
678	add	x22,x22,x8			// h+=X[i]
679	eor	x16,x16,x27,ror#18
680	eor	x15,x15,x10,ror#8
681	orr	x17,x17,x28			// Ch(e,f,g)
682	eor	x28,x23,x24			// a^b, b^c in next round
683	eor	x16,x16,x27,ror#41	// Sigma1(e)
684	eor	x0,x0,x23,ror#34
685	add	x22,x22,x17			// h+=Ch(e,f,g)
686	and	x19,x19,x28			// (b^c)&=(a^b)
687	eor	x14,x14,x7,ror#61
688	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
689	add	x22,x22,x16			// h+=Sigma1(e)
690	eor	x19,x19,x24			// Maj(a,b,c)
691	eor	x17,x0,x23,ror#39	// Sigma0(a)
692	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
693	add	x9,x9,x2
694	add	x26,x26,x22			// d+=h
695	add	x22,x22,x19			// h+=Maj(a,b,c)
696	ldr	x19,[x30],#8		// *K++, x28 in next round
697	add	x9,x9,x15
698	add	x22,x22,x17			// h+=Sigma0(a)
699	add	x9,x9,x14
700	ldr	x14,[sp,#24]
701	str	x1,[sp,#16]
702	ror	x16,x26,#14
703	add	x21,x21,x19			// h+=K[i]
704	ror	x0,x11,#1
705	and	x17,x27,x26
706	ror	x15,x8,#19
707	bic	x19,x20,x26
708	ror	x1,x22,#28
709	add	x21,x21,x9			// h+=X[i]
710	eor	x16,x16,x26,ror#18
711	eor	x0,x0,x11,ror#8
712	orr	x17,x17,x19			// Ch(e,f,g)
713	eor	x19,x22,x23			// a^b, b^c in next round
714	eor	x16,x16,x26,ror#41	// Sigma1(e)
715	eor	x1,x1,x22,ror#34
716	add	x21,x21,x17			// h+=Ch(e,f,g)
717	and	x28,x28,x19			// (b^c)&=(a^b)
718	eor	x15,x15,x8,ror#61
719	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
720	add	x21,x21,x16			// h+=Sigma1(e)
721	eor	x28,x28,x23			// Maj(a,b,c)
722	eor	x17,x1,x22,ror#39	// Sigma0(a)
723	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
724	add	x10,x10,x3
725	add	x25,x25,x21			// d+=h
726	add	x21,x21,x28			// h+=Maj(a,b,c)
727	ldr	x28,[x30],#8		// *K++, x19 in next round
728	add	x10,x10,x0
729	add	x21,x21,x17			// h+=Sigma0(a)
730	add	x10,x10,x15
731	ldr	x15,[sp,#0]
732	str	x2,[sp,#24]
733	ror	x16,x25,#14
734	add	x20,x20,x28			// h+=K[i]
735	ror	x1,x12,#1
736	and	x17,x26,x25
737	ror	x0,x9,#19
738	bic	x28,x27,x25
739	ror	x2,x21,#28
740	add	x20,x20,x10			// h+=X[i]
741	eor	x16,x16,x25,ror#18
742	eor	x1,x1,x12,ror#8
743	orr	x17,x17,x28			// Ch(e,f,g)
744	eor	x28,x21,x22			// a^b, b^c in next round
745	eor	x16,x16,x25,ror#41	// Sigma1(e)
746	eor	x2,x2,x21,ror#34
747	add	x20,x20,x17			// h+=Ch(e,f,g)
748	and	x19,x19,x28			// (b^c)&=(a^b)
749	eor	x0,x0,x9,ror#61
750	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
751	add	x20,x20,x16			// h+=Sigma1(e)
752	eor	x19,x19,x22			// Maj(a,b,c)
753	eor	x17,x2,x21,ror#39	// Sigma0(a)
754	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
755	add	x11,x11,x4
756	add	x24,x24,x20			// d+=h
757	add	x20,x20,x19			// h+=Maj(a,b,c)
758	ldr	x19,[x30],#8		// *K++, x28 in next round
759	add	x11,x11,x1
760	add	x20,x20,x17			// h+=Sigma0(a)
761	add	x11,x11,x0
762	ldr	x0,[sp,#8]
763	str	x3,[sp,#0]
764	ror	x16,x24,#14
765	add	x27,x27,x19			// h+=K[i]
766	ror	x2,x13,#1
767	and	x17,x25,x24
768	ror	x1,x10,#19
769	bic	x19,x26,x24
770	ror	x3,x20,#28
771	add	x27,x27,x11			// h+=X[i]
772	eor	x16,x16,x24,ror#18
773	eor	x2,x2,x13,ror#8
774	orr	x17,x17,x19			// Ch(e,f,g)
775	eor	x19,x20,x21			// a^b, b^c in next round
776	eor	x16,x16,x24,ror#41	// Sigma1(e)
777	eor	x3,x3,x20,ror#34
778	add	x27,x27,x17			// h+=Ch(e,f,g)
779	and	x28,x28,x19			// (b^c)&=(a^b)
780	eor	x1,x1,x10,ror#61
781	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
782	add	x27,x27,x16			// h+=Sigma1(e)
783	eor	x28,x28,x21			// Maj(a,b,c)
784	eor	x17,x3,x20,ror#39	// Sigma0(a)
785	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
786	add	x12,x12,x5
787	add	x23,x23,x27			// d+=h
788	add	x27,x27,x28			// h+=Maj(a,b,c)
789	ldr	x28,[x30],#8		// *K++, x19 in next round
790	add	x12,x12,x2
791	add	x27,x27,x17			// h+=Sigma0(a)
792	add	x12,x12,x1
793	ldr	x1,[sp,#16]
794	str	x4,[sp,#8]
795	ror	x16,x23,#14
796	add	x26,x26,x28			// h+=K[i]
797	ror	x3,x14,#1
798	and	x17,x24,x23
799	ror	x2,x11,#19
800	bic	x28,x25,x23
801	ror	x4,x27,#28
802	add	x26,x26,x12			// h+=X[i]
803	eor	x16,x16,x23,ror#18
804	eor	x3,x3,x14,ror#8
805	orr	x17,x17,x28			// Ch(e,f,g)
806	eor	x28,x27,x20			// a^b, b^c in next round
807	eor	x16,x16,x23,ror#41	// Sigma1(e)
808	eor	x4,x4,x27,ror#34
809	add	x26,x26,x17			// h+=Ch(e,f,g)
810	and	x19,x19,x28			// (b^c)&=(a^b)
811	eor	x2,x2,x11,ror#61
812	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
813	add	x26,x26,x16			// h+=Sigma1(e)
814	eor	x19,x19,x20			// Maj(a,b,c)
815	eor	x17,x4,x27,ror#39	// Sigma0(a)
816	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
817	add	x13,x13,x6
818	add	x22,x22,x26			// d+=h
819	add	x26,x26,x19			// h+=Maj(a,b,c)
820	ldr	x19,[x30],#8		// *K++, x28 in next round
821	add	x13,x13,x3
822	add	x26,x26,x17			// h+=Sigma0(a)
823	add	x13,x13,x2
824	ldr	x2,[sp,#24]
825	str	x5,[sp,#16]
826	ror	x16,x22,#14
827	add	x25,x25,x19			// h+=K[i]
828	ror	x4,x15,#1
829	and	x17,x23,x22
830	ror	x3,x12,#19
831	bic	x19,x24,x22
832	ror	x5,x26,#28
833	add	x25,x25,x13			// h+=X[i]
834	eor	x16,x16,x22,ror#18
835	eor	x4,x4,x15,ror#8
836	orr	x17,x17,x19			// Ch(e,f,g)
837	eor	x19,x26,x27			// a^b, b^c in next round
838	eor	x16,x16,x22,ror#41	// Sigma1(e)
839	eor	x5,x5,x26,ror#34
840	add	x25,x25,x17			// h+=Ch(e,f,g)
841	and	x28,x28,x19			// (b^c)&=(a^b)
842	eor	x3,x3,x12,ror#61
843	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
844	add	x25,x25,x16			// h+=Sigma1(e)
845	eor	x28,x28,x27			// Maj(a,b,c)
846	eor	x17,x5,x26,ror#39	// Sigma0(a)
847	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
848	add	x14,x14,x7
849	add	x21,x21,x25			// d+=h
850	add	x25,x25,x28			// h+=Maj(a,b,c)
851	ldr	x28,[x30],#8		// *K++, x19 in next round
852	add	x14,x14,x4
853	add	x25,x25,x17			// h+=Sigma0(a)
854	add	x14,x14,x3
855	ldr	x3,[sp,#0]
856	str	x6,[sp,#24]
857	ror	x16,x21,#14
858	add	x24,x24,x28			// h+=K[i]
859	ror	x5,x0,#1
860	and	x17,x22,x21
861	ror	x4,x13,#19
862	bic	x28,x23,x21
863	ror	x6,x25,#28
864	add	x24,x24,x14			// h+=X[i]
865	eor	x16,x16,x21,ror#18
866	eor	x5,x5,x0,ror#8
867	orr	x17,x17,x28			// Ch(e,f,g)
868	eor	x28,x25,x26			// a^b, b^c in next round
869	eor	x16,x16,x21,ror#41	// Sigma1(e)
870	eor	x6,x6,x25,ror#34
871	add	x24,x24,x17			// h+=Ch(e,f,g)
872	and	x19,x19,x28			// (b^c)&=(a^b)
873	eor	x4,x4,x13,ror#61
874	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
875	add	x24,x24,x16			// h+=Sigma1(e)
876	eor	x19,x19,x26			// Maj(a,b,c)
877	eor	x17,x6,x25,ror#39	// Sigma0(a)
878	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
879	add	x15,x15,x8
880	add	x20,x20,x24			// d+=h
881	add	x24,x24,x19			// h+=Maj(a,b,c)
882	ldr	x19,[x30],#8		// *K++, x28 in next round
883	add	x15,x15,x5
884	add	x24,x24,x17			// h+=Sigma0(a)
885	add	x15,x15,x4
886	ldr	x4,[sp,#8]
887	str	x7,[sp,#0]
888	ror	x16,x20,#14
889	add	x23,x23,x19			// h+=K[i]
890	ror	x6,x1,#1
891	and	x17,x21,x20
892	ror	x5,x14,#19
893	bic	x19,x22,x20
894	ror	x7,x24,#28
895	add	x23,x23,x15			// h+=X[i]
896	eor	x16,x16,x20,ror#18
897	eor	x6,x6,x1,ror#8
898	orr	x17,x17,x19			// Ch(e,f,g)
899	eor	x19,x24,x25			// a^b, b^c in next round
900	eor	x16,x16,x20,ror#41	// Sigma1(e)
901	eor	x7,x7,x24,ror#34
902	add	x23,x23,x17			// h+=Ch(e,f,g)
903	and	x28,x28,x19			// (b^c)&=(a^b)
904	eor	x5,x5,x14,ror#61
905	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
906	add	x23,x23,x16			// h+=Sigma1(e)
907	eor	x28,x28,x25			// Maj(a,b,c)
908	eor	x17,x7,x24,ror#39	// Sigma0(a)
909	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
910	add	x0,x0,x9
911	add	x27,x27,x23			// d+=h
912	add	x23,x23,x28			// h+=Maj(a,b,c)
913	ldr	x28,[x30],#8		// *K++, x19 in next round
914	add	x0,x0,x6
915	add	x23,x23,x17			// h+=Sigma0(a)
916	add	x0,x0,x5
917	ldr	x5,[sp,#16]
918	str	x8,[sp,#8]
919	ror	x16,x27,#14
920	add	x22,x22,x28			// h+=K[i]
921	ror	x7,x2,#1
922	and	x17,x20,x27
923	ror	x6,x15,#19
924	bic	x28,x21,x27
925	ror	x8,x23,#28
926	add	x22,x22,x0			// h+=X[i]
927	eor	x16,x16,x27,ror#18
928	eor	x7,x7,x2,ror#8
929	orr	x17,x17,x28			// Ch(e,f,g)
930	eor	x28,x23,x24			// a^b, b^c in next round
931	eor	x16,x16,x27,ror#41	// Sigma1(e)
932	eor	x8,x8,x23,ror#34
933	add	x22,x22,x17			// h+=Ch(e,f,g)
934	and	x19,x19,x28			// (b^c)&=(a^b)
935	eor	x6,x6,x15,ror#61
936	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
937	add	x22,x22,x16			// h+=Sigma1(e)
938	eor	x19,x19,x24			// Maj(a,b,c)
939	eor	x17,x8,x23,ror#39	// Sigma0(a)
940	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
941	add	x1,x1,x10
942	add	x26,x26,x22			// d+=h
943	add	x22,x22,x19			// h+=Maj(a,b,c)
944	ldr	x19,[x30],#8		// *K++, x28 in next round
945	add	x1,x1,x7
946	add	x22,x22,x17			// h+=Sigma0(a)
947	add	x1,x1,x6
948	ldr	x6,[sp,#24]
949	str	x9,[sp,#16]
950	ror	x16,x26,#14
951	add	x21,x21,x19			// h+=K[i]
952	ror	x8,x3,#1
953	and	x17,x27,x26
954	ror	x7,x0,#19
955	bic	x19,x20,x26
956	ror	x9,x22,#28
957	add	x21,x21,x1			// h+=X[i]
958	eor	x16,x16,x26,ror#18
959	eor	x8,x8,x3,ror#8
960	orr	x17,x17,x19			// Ch(e,f,g)
961	eor	x19,x22,x23			// a^b, b^c in next round
962	eor	x16,x16,x26,ror#41	// Sigma1(e)
963	eor	x9,x9,x22,ror#34
964	add	x21,x21,x17			// h+=Ch(e,f,g)
965	and	x28,x28,x19			// (b^c)&=(a^b)
966	eor	x7,x7,x0,ror#61
967	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
968	add	x21,x21,x16			// h+=Sigma1(e)
969	eor	x28,x28,x23			// Maj(a,b,c)
970	eor	x17,x9,x22,ror#39	// Sigma0(a)
971	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
972	add	x2,x2,x11
973	add	x25,x25,x21			// d+=h
974	add	x21,x21,x28			// h+=Maj(a,b,c)
975	ldr	x28,[x30],#8		// *K++, x19 in next round
976	add	x2,x2,x8
977	add	x21,x21,x17			// h+=Sigma0(a)
978	add	x2,x2,x7
979	ldr	x7,[sp,#0]
980	str	x10,[sp,#24]
981	ror	x16,x25,#14
982	add	x20,x20,x28			// h+=K[i]
983	ror	x9,x4,#1
984	and	x17,x26,x25
985	ror	x8,x1,#19
986	bic	x28,x27,x25
987	ror	x10,x21,#28
988	add	x20,x20,x2			// h+=X[i]
989	eor	x16,x16,x25,ror#18
990	eor	x9,x9,x4,ror#8
991	orr	x17,x17,x28			// Ch(e,f,g)
992	eor	x28,x21,x22			// a^b, b^c in next round
993	eor	x16,x16,x25,ror#41	// Sigma1(e)
994	eor	x10,x10,x21,ror#34
995	add	x20,x20,x17			// h+=Ch(e,f,g)
996	and	x19,x19,x28			// (b^c)&=(a^b)
997	eor	x8,x8,x1,ror#61
998	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
999	add	x20,x20,x16			// h+=Sigma1(e)
1000	eor	x19,x19,x22			// Maj(a,b,c)
1001	eor	x17,x10,x21,ror#39	// Sigma0(a)
1002	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
1003	add	x3,x3,x12
1004	add	x24,x24,x20			// d+=h
1005	add	x20,x20,x19			// h+=Maj(a,b,c)
1006	ldr	x19,[x30],#8		// *K++, x28 in next round
1007	add	x3,x3,x9
1008	add	x20,x20,x17			// h+=Sigma0(a)
1009	add	x3,x3,x8
1010	cbnz	x19,Loop_16_xx
1011
1012	ldp	x0,x2,[x29,#96]
1013	ldr	x1,[x29,#112]
1014	sub	x30,x30,#648		// rewind
1015
1016	ldp	x3,x4,[x0]
1017	ldp	x5,x6,[x0,#2*8]
1018	add	x1,x1,#14*8			// advance input pointer
1019	ldp	x7,x8,[x0,#4*8]
1020	add	x20,x20,x3
1021	ldp	x9,x10,[x0,#6*8]
1022	add	x21,x21,x4
1023	add	x22,x22,x5
1024	add	x23,x23,x6
1025	stp	x20,x21,[x0]
1026	add	x24,x24,x7
1027	add	x25,x25,x8
1028	stp	x22,x23,[x0,#2*8]
1029	add	x26,x26,x9
1030	add	x27,x27,x10
1031	cmp	x1,x2
1032	stp	x24,x25,[x0,#4*8]
1033	stp	x26,x27,[x0,#6*8]
1034	b.ne	Loop
1035
1036	ldp	x19,x20,[x29,#16]
1037	add	sp,sp,#4*8
1038	ldp	x21,x22,[x29,#32]
1039	ldp	x23,x24,[x29,#48]
1040	ldp	x25,x26,[x29,#64]
1041	ldp	x27,x28,[x29,#80]
1042	ldp	x29,x30,[sp],#128
1043	AARCH64_VALIDATE_LINK_REGISTER
1044	ret
1045
1046
// Read-only data for the SHA-512 code in this file.
// .align 6 requests 2^6 = 64-byte alignment for the table that follows.
1047.section	.rodata
1048.align	6
1049
// LK512: the 80 64-bit SHA-512 round constants (80 * 8 = 640 bytes), followed
// by a single zero quadword (648 bytes in total).
//  - The scalar code path loads one constant at a time and loops while the
//    loaded value is non-zero (`ldr x19,[x30],#8` / `cbnz x19,Loop_16_xx`),
//    then rewinds its pointer by 648 bytes; presumably that pointer walks this
//    table, in which case the zero quadword is its end-of-table sentinel.
//  - The hardware code path (sha512_block_armv8) loads the constants two at a
//    time and rewinds by 80*8 bytes, so it never reads the terminator.
// Do not reorder, add or remove entries without updating both rewind amounts.
1050LK512:
1051.quad	0x428a2f98d728ae22,0x7137449123ef65cd
1052.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
1053.quad	0x3956c25bf348b538,0x59f111f1b605d019
1054.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
1055.quad	0xd807aa98a3030242,0x12835b0145706fbe
1056.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
1057.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
1058.quad	0x9bdc06a725c71235,0xc19bf174cf692694
1059.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
1060.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
1061.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
1062.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
1063.quad	0x983e5152ee66dfab,0xa831c66d2db43210
1064.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
1065.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
1066.quad	0x06ca6351e003826f,0x142929670a0e6e70
1067.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
1068.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
1069.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
1070.quad	0x81c2c92e47edaee6,0x92722c851482353b
1071.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
1072.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
1073.quad	0xd192e819d6ef5218,0xd69906245565a910
1074.quad	0xf40e35855771202a,0x106aa07032bbd1b8
1075.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
1076.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
1077.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
1078.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
1079.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
1080.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
1081.quad	0x90befffa23631e28,0xa4506cebde82bde9
1082.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
1083.quad	0xca273eceea26619c,0xd186b8c721c0c207
1084.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
1085.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
1086.quad	0x113f9804bef90dae,0x1b710b35131c471b
1087.quad	0x28db77f523047d84,0x32caab7b40c72493
1088.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
1089.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
1090.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
1091.quad	0	// terminator
1092
// NUL-terminated ASCII identification string:
//   "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
// No label is attached to it in the visible part of the file.
1093.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// Re-align to 2^2 = 4 bytes after the odd-length string. The second directive
// repeats the first and has no further effect; this file is generated (see the
// file header), so it is left exactly as emitted.
1094.align	2
1095.align	2
1096.text
1097#ifndef	__KERNEL__
// sha512_block_armv8 (also reachable through the local label Lv8_entry):
// SHA-512 block transform built on the Armv8 SHA-512 instructions. Those
// instructions are emitted as raw `.long` encodings; the comment on each
// `.long` gives the mnemonic it encodes.
//
// Register use, as read from the code below:
//   x0       pointer to the 64-byte hash state; loaded into v0-v3 on entry and
//            stored back once, after the last block
//   x1       input pointer; 128 bytes are consumed per block
//   x2       block counter; it is decremented *before* it is tested, so it is
//            expected to be at least 1 on entry
//   x3       cursor into LK512; advanced 16 bytes per step, rewound by 80*8
//            at the end of every block
//   x4       scratch: x1-128, used by the conditional rewind
//   v0-v4    working state; the roles rotate from step to step and return to
//            v0-v3 after every fifth step
//   v5-v7    scratch operands built with `ext`
//   v16-v23  16-word message schedule, byte-swapped per 64-bit lane (rev64)
//   v24,v25  alternating "round constants + schedule words" sums
//   v26-v29  copy of the state at the start of the block, added back at the end
// v8-v15 are not touched. The flags are clobbered (subs).
//
// The .def/.type/.endef triple is the COFF symbol record for this function
// (type 32 marks the symbol as a function).
1098.def sha512_block_armv8
1099   .type 32
1100.endef
1101.align	6
1102sha512_block_armv8:
1103Lv8_entry:
// Prologue: push the frame record (x29,x30) and point x29 at it.
1104	stp	x29,x30,[sp,#-16]!
1105	add	x29,sp,#0
1106
// Pre-load the whole first 128-byte block into v16-v23; x1 now points at the
// next block.
1107	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64	// load input
1108	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
1109
1110	ld1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// load context
// x3 = address of the round-constant table (page address + low 12 bits).
1111	adrp	x3,LK512
1112	add	x3,x3,:lo12:LK512
1113
// Reverse the bytes inside every 64-bit lane of the input words.
1114	rev64	v16.16b,v16.16b
1115	rev64	v17.16b,v17.16b
1116	rev64	v18.16b,v18.16b
1117	rev64	v19.16b,v19.16b
1118	rev64	v20.16b,v20.16b
1119	rev64	v21.16b,v21.16b
1120	rev64	v22.16b,v22.16b
1121	rev64	v23.16b,v23.16b
// Branch over whatever padding the following .align inserts.
1122	b	Loop_hw
1123
// One iteration of Loop_hw processes one 128-byte block in 40 steps. Each step
// consumes one 2x64-bit vector of constants from LK512, so the 40 steps use
// all 80 constants.
1124.align	4
1125Loop_hw:
1126	ld1	{v24.2d},[x3],#16
1127	subs	x2,x2,#1
1128	sub	x4,x1,#128
1129	orr	v26.16b,v0.16b,v0.16b			// offload
1130	orr	v27.16b,v1.16b,v1.16b
1131	orr	v28.16b,v2.16b,v2.16b
1132	orr	v29.16b,v3.16b,v3.16b
// If this is the last block (x2 reached 0) move x1 back to the start of the
// current block, so the "load next input" loads at the end of the iteration
// re-read this block; presumably this is to avoid reading past the caller's
// buffer. The values loaded in that case are never used.
1133	csel	x1,x1,x4,ne			// conditional rewind
// Steps 1-8 (first pass over v16-v23). Steps 1-32 all have the same shape:
// add the constants to the schedule word, run sha512h/sha512h2 on the state,
// and update the schedule word in place with sha512su0/sha512su1. The load of
// the next constant vector is interleaved into each step.
1134	add	v24.2d,v24.2d,v16.2d
1135	ld1	{v25.2d},[x3],#16
1136	ext	v24.16b,v24.16b,v24.16b,#8
1137	ext	v5.16b,v2.16b,v3.16b,#8
1138	ext	v6.16b,v1.16b,v2.16b,#8
1139	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1140.long	0xcec08230	//sha512su0 v16.16b,v17.16b
1141	ext	v7.16b,v20.16b,v21.16b,#8
1142.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1143.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1144	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1145.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1146	add	v25.2d,v25.2d,v17.2d
1147	ld1	{v24.2d},[x3],#16
1148	ext	v25.16b,v25.16b,v25.16b,#8
1149	ext	v5.16b,v4.16b,v2.16b,#8
1150	ext	v6.16b,v0.16b,v4.16b,#8
1151	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1152.long	0xcec08251	//sha512su0 v17.16b,v18.16b
1153	ext	v7.16b,v21.16b,v22.16b,#8
1154.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1155.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1156	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1157.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1158	add	v24.2d,v24.2d,v18.2d
1159	ld1	{v25.2d},[x3],#16
1160	ext	v24.16b,v24.16b,v24.16b,#8
1161	ext	v5.16b,v1.16b,v4.16b,#8
1162	ext	v6.16b,v3.16b,v1.16b,#8
1163	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1164.long	0xcec08272	//sha512su0 v18.16b,v19.16b
1165	ext	v7.16b,v22.16b,v23.16b,#8
1166.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1167.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1168	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1169.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1170	add	v25.2d,v25.2d,v19.2d
1171	ld1	{v24.2d},[x3],#16
1172	ext	v25.16b,v25.16b,v25.16b,#8
1173	ext	v5.16b,v0.16b,v1.16b,#8
1174	ext	v6.16b,v2.16b,v0.16b,#8
1175	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1176.long	0xcec08293	//sha512su0 v19.16b,v20.16b
1177	ext	v7.16b,v23.16b,v16.16b,#8
1178.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1179.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1180	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1181.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1182	add	v24.2d,v24.2d,v20.2d
1183	ld1	{v25.2d},[x3],#16
1184	ext	v24.16b,v24.16b,v24.16b,#8
1185	ext	v5.16b,v3.16b,v0.16b,#8
1186	ext	v6.16b,v4.16b,v3.16b,#8
1187	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1188.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
1189	ext	v7.16b,v16.16b,v17.16b,#8
1190.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1191.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1192	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1193.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1194	add	v25.2d,v25.2d,v21.2d
1195	ld1	{v24.2d},[x3],#16
1196	ext	v25.16b,v25.16b,v25.16b,#8
1197	ext	v5.16b,v2.16b,v3.16b,#8
1198	ext	v6.16b,v1.16b,v2.16b,#8
1199	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1200.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
1201	ext	v7.16b,v17.16b,v18.16b,#8
1202.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1203.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1204	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1205.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1206	add	v24.2d,v24.2d,v22.2d
1207	ld1	{v25.2d},[x3],#16
1208	ext	v24.16b,v24.16b,v24.16b,#8
1209	ext	v5.16b,v4.16b,v2.16b,#8
1210	ext	v6.16b,v0.16b,v4.16b,#8
1211	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1212.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
1213	ext	v7.16b,v18.16b,v19.16b,#8
1214.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1215.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1216	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1217.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1218	add	v25.2d,v25.2d,v23.2d
1219	ld1	{v24.2d},[x3],#16
1220	ext	v25.16b,v25.16b,v25.16b,#8
1221	ext	v5.16b,v1.16b,v4.16b,#8
1222	ext	v6.16b,v3.16b,v1.16b,#8
1223	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1224.long	0xcec08217	//sha512su0 v23.16b,v16.16b
1225	ext	v7.16b,v19.16b,v20.16b,#8
1226.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1227.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1228	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1229.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
// Steps 9-16 (second pass over v16-v23).
1230	add	v24.2d,v24.2d,v16.2d
1231	ld1	{v25.2d},[x3],#16
1232	ext	v24.16b,v24.16b,v24.16b,#8
1233	ext	v5.16b,v0.16b,v1.16b,#8
1234	ext	v6.16b,v2.16b,v0.16b,#8
1235	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1236.long	0xcec08230	//sha512su0 v16.16b,v17.16b
1237	ext	v7.16b,v20.16b,v21.16b,#8
1238.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1239.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1240	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1241.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1242	add	v25.2d,v25.2d,v17.2d
1243	ld1	{v24.2d},[x3],#16
1244	ext	v25.16b,v25.16b,v25.16b,#8
1245	ext	v5.16b,v3.16b,v0.16b,#8
1246	ext	v6.16b,v4.16b,v3.16b,#8
1247	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1248.long	0xcec08251	//sha512su0 v17.16b,v18.16b
1249	ext	v7.16b,v21.16b,v22.16b,#8
1250.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1251.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1252	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1253.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1254	add	v24.2d,v24.2d,v18.2d
1255	ld1	{v25.2d},[x3],#16
1256	ext	v24.16b,v24.16b,v24.16b,#8
1257	ext	v5.16b,v2.16b,v3.16b,#8
1258	ext	v6.16b,v1.16b,v2.16b,#8
1259	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1260.long	0xcec08272	//sha512su0 v18.16b,v19.16b
1261	ext	v7.16b,v22.16b,v23.16b,#8
1262.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1263.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1264	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1265.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1266	add	v25.2d,v25.2d,v19.2d
1267	ld1	{v24.2d},[x3],#16
1268	ext	v25.16b,v25.16b,v25.16b,#8
1269	ext	v5.16b,v4.16b,v2.16b,#8
1270	ext	v6.16b,v0.16b,v4.16b,#8
1271	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1272.long	0xcec08293	//sha512su0 v19.16b,v20.16b
1273	ext	v7.16b,v23.16b,v16.16b,#8
1274.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1275.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1276	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1277.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1278	add	v24.2d,v24.2d,v20.2d
1279	ld1	{v25.2d},[x3],#16
1280	ext	v24.16b,v24.16b,v24.16b,#8
1281	ext	v5.16b,v1.16b,v4.16b,#8
1282	ext	v6.16b,v3.16b,v1.16b,#8
1283	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1284.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
1285	ext	v7.16b,v16.16b,v17.16b,#8
1286.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1287.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1288	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1289.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1290	add	v25.2d,v25.2d,v21.2d
1291	ld1	{v24.2d},[x3],#16
1292	ext	v25.16b,v25.16b,v25.16b,#8
1293	ext	v5.16b,v0.16b,v1.16b,#8
1294	ext	v6.16b,v2.16b,v0.16b,#8
1295	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1296.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
1297	ext	v7.16b,v17.16b,v18.16b,#8
1298.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1299.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1300	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1301.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1302	add	v24.2d,v24.2d,v22.2d
1303	ld1	{v25.2d},[x3],#16
1304	ext	v24.16b,v24.16b,v24.16b,#8
1305	ext	v5.16b,v3.16b,v0.16b,#8
1306	ext	v6.16b,v4.16b,v3.16b,#8
1307	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1308.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
1309	ext	v7.16b,v18.16b,v19.16b,#8
1310.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1311.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1312	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1313.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1314	add	v25.2d,v25.2d,v23.2d
1315	ld1	{v24.2d},[x3],#16
1316	ext	v25.16b,v25.16b,v25.16b,#8
1317	ext	v5.16b,v2.16b,v3.16b,#8
1318	ext	v6.16b,v1.16b,v2.16b,#8
1319	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1320.long	0xcec08217	//sha512su0 v23.16b,v16.16b
1321	ext	v7.16b,v19.16b,v20.16b,#8
1322.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1323.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1324	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1325.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
// Steps 17-24 (third pass over v16-v23).
1326	add	v24.2d,v24.2d,v16.2d
1327	ld1	{v25.2d},[x3],#16
1328	ext	v24.16b,v24.16b,v24.16b,#8
1329	ext	v5.16b,v4.16b,v2.16b,#8
1330	ext	v6.16b,v0.16b,v4.16b,#8
1331	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1332.long	0xcec08230	//sha512su0 v16.16b,v17.16b
1333	ext	v7.16b,v20.16b,v21.16b,#8
1334.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1335.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1336	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1337.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1338	add	v25.2d,v25.2d,v17.2d
1339	ld1	{v24.2d},[x3],#16
1340	ext	v25.16b,v25.16b,v25.16b,#8
1341	ext	v5.16b,v1.16b,v4.16b,#8
1342	ext	v6.16b,v3.16b,v1.16b,#8
1343	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1344.long	0xcec08251	//sha512su0 v17.16b,v18.16b
1345	ext	v7.16b,v21.16b,v22.16b,#8
1346.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1347.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1348	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1349.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1350	add	v24.2d,v24.2d,v18.2d
1351	ld1	{v25.2d},[x3],#16
1352	ext	v24.16b,v24.16b,v24.16b,#8
1353	ext	v5.16b,v0.16b,v1.16b,#8
1354	ext	v6.16b,v2.16b,v0.16b,#8
1355	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1356.long	0xcec08272	//sha512su0 v18.16b,v19.16b
1357	ext	v7.16b,v22.16b,v23.16b,#8
1358.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1359.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1360	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1361.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1362	add	v25.2d,v25.2d,v19.2d
1363	ld1	{v24.2d},[x3],#16
1364	ext	v25.16b,v25.16b,v25.16b,#8
1365	ext	v5.16b,v3.16b,v0.16b,#8
1366	ext	v6.16b,v4.16b,v3.16b,#8
1367	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1368.long	0xcec08293	//sha512su0 v19.16b,v20.16b
1369	ext	v7.16b,v23.16b,v16.16b,#8
1370.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1371.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1372	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1373.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1374	add	v24.2d,v24.2d,v20.2d
1375	ld1	{v25.2d},[x3],#16
1376	ext	v24.16b,v24.16b,v24.16b,#8
1377	ext	v5.16b,v2.16b,v3.16b,#8
1378	ext	v6.16b,v1.16b,v2.16b,#8
1379	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1380.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
1381	ext	v7.16b,v16.16b,v17.16b,#8
1382.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1383.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1384	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1385.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1386	add	v25.2d,v25.2d,v21.2d
1387	ld1	{v24.2d},[x3],#16
1388	ext	v25.16b,v25.16b,v25.16b,#8
1389	ext	v5.16b,v4.16b,v2.16b,#8
1390	ext	v6.16b,v0.16b,v4.16b,#8
1391	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1392.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
1393	ext	v7.16b,v17.16b,v18.16b,#8
1394.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1395.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1396	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1397.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1398	add	v24.2d,v24.2d,v22.2d
1399	ld1	{v25.2d},[x3],#16
1400	ext	v24.16b,v24.16b,v24.16b,#8
1401	ext	v5.16b,v1.16b,v4.16b,#8
1402	ext	v6.16b,v3.16b,v1.16b,#8
1403	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1404.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
1405	ext	v7.16b,v18.16b,v19.16b,#8
1406.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1407.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1408	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1409.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1410	add	v25.2d,v25.2d,v23.2d
1411	ld1	{v24.2d},[x3],#16
1412	ext	v25.16b,v25.16b,v25.16b,#8
1413	ext	v5.16b,v0.16b,v1.16b,#8
1414	ext	v6.16b,v2.16b,v0.16b,#8
1415	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1416.long	0xcec08217	//sha512su0 v23.16b,v16.16b
1417	ext	v7.16b,v19.16b,v20.16b,#8
1418.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1419.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1420	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1421.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
// Steps 25-32 (fourth pass over v16-v23; last steps that update the schedule).
1422	add	v24.2d,v24.2d,v16.2d
1423	ld1	{v25.2d},[x3],#16
1424	ext	v24.16b,v24.16b,v24.16b,#8
1425	ext	v5.16b,v3.16b,v0.16b,#8
1426	ext	v6.16b,v4.16b,v3.16b,#8
1427	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1428.long	0xcec08230	//sha512su0 v16.16b,v17.16b
1429	ext	v7.16b,v20.16b,v21.16b,#8
1430.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1431.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1432	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1433.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1434	add	v25.2d,v25.2d,v17.2d
1435	ld1	{v24.2d},[x3],#16
1436	ext	v25.16b,v25.16b,v25.16b,#8
1437	ext	v5.16b,v2.16b,v3.16b,#8
1438	ext	v6.16b,v1.16b,v2.16b,#8
1439	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1440.long	0xcec08251	//sha512su0 v17.16b,v18.16b
1441	ext	v7.16b,v21.16b,v22.16b,#8
1442.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1443.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1444	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1445.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1446	add	v24.2d,v24.2d,v18.2d
1447	ld1	{v25.2d},[x3],#16
1448	ext	v24.16b,v24.16b,v24.16b,#8
1449	ext	v5.16b,v4.16b,v2.16b,#8
1450	ext	v6.16b,v0.16b,v4.16b,#8
1451	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1452.long	0xcec08272	//sha512su0 v18.16b,v19.16b
1453	ext	v7.16b,v22.16b,v23.16b,#8
1454.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1455.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1456	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1457.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1458	add	v25.2d,v25.2d,v19.2d
1459	ld1	{v24.2d},[x3],#16
1460	ext	v25.16b,v25.16b,v25.16b,#8
1461	ext	v5.16b,v1.16b,v4.16b,#8
1462	ext	v6.16b,v3.16b,v1.16b,#8
1463	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1464.long	0xcec08293	//sha512su0 v19.16b,v20.16b
1465	ext	v7.16b,v23.16b,v16.16b,#8
1466.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1467.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1468	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1469.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1470	add	v24.2d,v24.2d,v20.2d
1471	ld1	{v25.2d},[x3],#16
1472	ext	v24.16b,v24.16b,v24.16b,#8
1473	ext	v5.16b,v0.16b,v1.16b,#8
1474	ext	v6.16b,v2.16b,v0.16b,#8
1475	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1476.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
1477	ext	v7.16b,v16.16b,v17.16b,#8
1478.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1479.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1480	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1481.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1482	add	v25.2d,v25.2d,v21.2d
1483	ld1	{v24.2d},[x3],#16
1484	ext	v25.16b,v25.16b,v25.16b,#8
1485	ext	v5.16b,v3.16b,v0.16b,#8
1486	ext	v6.16b,v4.16b,v3.16b,#8
1487	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1488.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
1489	ext	v7.16b,v17.16b,v18.16b,#8
1490.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1491.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1492	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1493.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1494	add	v24.2d,v24.2d,v22.2d
1495	ld1	{v25.2d},[x3],#16
1496	ext	v24.16b,v24.16b,v24.16b,#8
1497	ext	v5.16b,v2.16b,v3.16b,#8
1498	ext	v6.16b,v1.16b,v2.16b,#8
1499	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1500.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
1501	ext	v7.16b,v18.16b,v19.16b,#8
1502.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1503.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1504	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1505.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1506	add	v25.2d,v25.2d,v23.2d
1507	ld1	{v24.2d},[x3],#16
1508	ext	v25.16b,v25.16b,v25.16b,#8
1509	ext	v5.16b,v4.16b,v2.16b,#8
1510	ext	v6.16b,v0.16b,v4.16b,#8
1511	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1512.long	0xcec08217	//sha512su0 v23.16b,v16.16b
1513	ext	v7.16b,v19.16b,v20.16b,#8
1514.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1515.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1516	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1517.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
// Steps 33-40: no further schedule update. Once a schedule register has been
// added into v24/v25 it is immediately reloaded with the next 16 input bytes
// and byte-swapped, so v16-v23 hold the next block when the loop repeats.
1518	ld1	{v25.2d},[x3],#16
1519	add	v24.2d,v24.2d,v16.2d
1520	ld1	{v16.16b},[x1],#16		// load next input
1521	ext	v24.16b,v24.16b,v24.16b,#8
1522	ext	v5.16b,v1.16b,v4.16b,#8
1523	ext	v6.16b,v3.16b,v1.16b,#8
1524	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1525.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1526	rev64	v16.16b,v16.16b
1527	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1528.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1529	ld1	{v24.2d},[x3],#16
1530	add	v25.2d,v25.2d,v17.2d
1531	ld1	{v17.16b},[x1],#16		// load next input
1532	ext	v25.16b,v25.16b,v25.16b,#8
1533	ext	v5.16b,v0.16b,v1.16b,#8
1534	ext	v6.16b,v2.16b,v0.16b,#8
1535	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1536.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1537	rev64	v17.16b,v17.16b
1538	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1539.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1540	ld1	{v25.2d},[x3],#16
1541	add	v24.2d,v24.2d,v18.2d
1542	ld1	{v18.16b},[x1],#16		// load next input
1543	ext	v24.16b,v24.16b,v24.16b,#8
1544	ext	v5.16b,v3.16b,v0.16b,#8
1545	ext	v6.16b,v4.16b,v3.16b,#8
1546	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1547.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1548	rev64	v18.16b,v18.16b
1549	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1550.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1551	ld1	{v24.2d},[x3],#16
1552	add	v25.2d,v25.2d,v19.2d
1553	ld1	{v19.16b},[x1],#16		// load next input
1554	ext	v25.16b,v25.16b,v25.16b,#8
1555	ext	v5.16b,v2.16b,v3.16b,#8
1556	ext	v6.16b,v1.16b,v2.16b,#8
1557	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1558.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1559	rev64	v19.16b,v19.16b
1560	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1561.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1562	ld1	{v25.2d},[x3],#16
1563	add	v24.2d,v24.2d,v20.2d
1564	ld1	{v20.16b},[x1],#16		// load next input
1565	ext	v24.16b,v24.16b,v24.16b,#8
1566	ext	v5.16b,v4.16b,v2.16b,#8
1567	ext	v6.16b,v0.16b,v4.16b,#8
1568	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1569.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1570	rev64	v20.16b,v20.16b
1571	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1572.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1573	ld1	{v24.2d},[x3],#16
1574	add	v25.2d,v25.2d,v21.2d
1575	ld1	{v21.16b},[x1],#16		// load next input
1576	ext	v25.16b,v25.16b,v25.16b,#8
1577	ext	v5.16b,v1.16b,v4.16b,#8
1578	ext	v6.16b,v3.16b,v1.16b,#8
1579	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1580.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1581	rev64	v21.16b,v21.16b
1582	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1583.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1584	ld1	{v25.2d},[x3],#16
1585	add	v24.2d,v24.2d,v22.2d
1586	ld1	{v22.16b},[x1],#16		// load next input
1587	ext	v24.16b,v24.16b,v24.16b,#8
1588	ext	v5.16b,v0.16b,v1.16b,#8
1589	ext	v6.16b,v2.16b,v0.16b,#8
1590	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1591.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1592	rev64	v22.16b,v22.16b
1593	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1594.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
// All 40 constant vectors (40 * 16 = 80*8 bytes) have been loaded by now, so
// x3 is moved back to the start of LK512 for the next block.
1595	sub	x3,x3,#80*8	// rewind
1596	add	v25.2d,v25.2d,v23.2d
1597	ld1	{v23.16b},[x1],#16		// load next input
1598	ext	v25.16b,v25.16b,v25.16b,#8
1599	ext	v5.16b,v3.16b,v0.16b,#8
1600	ext	v6.16b,v4.16b,v3.16b,#8
1601	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1602.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1603	rev64	v23.16b,v23.16b
1604	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1605.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
// Add the state saved at the top of the loop (v26-v29) to the new working
// state; the result is the chaining value for the next block.
1606	add	v0.2d,v0.2d,v26.2d			// accumulate
1607	add	v1.2d,v1.2d,v27.2d
1608	add	v2.2d,v2.2d,v28.2d
1609	add	v3.2d,v3.2d,v29.2d
1610
// x2 was decremented by the subs at the top of this iteration.
1611	cbnz	x2,Loop_hw
1612
1613	st1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// store context
1614
// Epilogue: reload x29 and pop the 16-byte frame record. x30 is not reloaded
// because nothing in this function writes it.
1615	ldr	x29,[sp],#16
1616	ret
1617
1618#endif
1619#endif
1620#endif  // !OPENSSL_NO_ASM
1621