• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#if !defined(__has_feature)
5#define __has_feature(x) 0
6#endif
7#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
8#define OPENSSL_NO_ASM
9#endif
10
11#if !defined(OPENSSL_NO_ASM)
12#if defined(BORINGSSL_PREFIX)
13#include <boringssl_prefix_symbols_asm.h>
14#endif
15// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
16//
17// Licensed under the OpenSSL license (the "License").  You may not use
18// this file except in compliance with the License.  You can obtain a copy
19// in the file LICENSE in the source distribution or at
20// https://www.openssl.org/source/license.html
21
22// ====================================================================
23// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
24// project. The module is, however, dual licensed under OpenSSL and
25// CRYPTOGAMS licenses depending on where you obtain it. For further
26// details see http://www.openssl.org/~appro/cryptogams/.
27//
28// Permission to use under GPLv2 terms is granted.
29// ====================================================================
30//
31// SHA256/512 for ARMv8.
32//
33// Performance in cycles per processed byte and improvement coefficient
34// over code generated with "default" compiler:
35//
36//		SHA256-hw	SHA256(*)	SHA512
37// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
38// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
39// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
40// Denver	2.01		10.5 (+26%)	6.70 (+8%)
41// X-Gene			20.0 (+100%)	12.8 (+300%(***))
42// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
43// Kryo		1.92		17.4 (+30%)	11.2 (+8%)
44//
45// (*)	Software SHA256 results are of lesser relevance, presented
46//	mostly for informational purposes.
47// (**)	The result is a trade-off: it's possible to improve it by
48//	10% (or by 1 cycle per round), but at the cost of 20% loss
49//	on Cortex-A53 (or by 4 cycles per round).
50// (***)	Super-impressive coefficients over gcc-generated code are
51//	indication of some compiler "pathology", most notably code
52//	generated with -mgeneral-regs-only is significantly faster
53//	and the gap is only 40-90%.
54
55#ifndef	__KERNEL__
56# include <openssl/arm_arch.h>
57#endif
58
59.text
60
61
// _sha512_block_data_order -- entry point of the integer-register SHA-512
// block function: symbol declarations, capability dispatch, frame setup and
// state load.
//
// Register inputs, as used by the code below:
//   x0 = pointer to eight 64-bit state words (the "context")
//   x1 = pointer to the input data
//   x2 = number of 128-byte blocks (x1 + (x2 << 7) is taken as end of input)
//
// Unless __KERNEL__ is defined, the 32-bit word at _OPENSSL_armcap_P is
// tested against ARMV8_SHA512 and, when the bit is set, control transfers to
// Lv8_entry (not defined in this part of the file).
//
// Stack frame built here (offsets relative to x29):
//   #0   .. #15   saved x29/x30
//   #16  .. #95   saved x19..x28
//   #96           x0, the context pointer
//   #104          end-of-input pointer
//   #112          input pointer (written at the top of Loop)
// A further 32 bytes are reserved below the frame and addressed as
// [sp,#0] .. [sp,#24]; the round code uses them as four 8-byte scratch slots.
62.private_extern	_OPENSSL_armcap_P
63.globl	_sha512_block_data_order
64.private_extern	_sha512_block_data_order
65
66.align	6
67_sha512_block_data_order:
68	AARCH64_VALID_CALL_TARGET
69#ifndef	__KERNEL__
	// Capability check: x16/w16 is only a temporary here.
70#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
71	adrp	x16,:pg_hi21_nc:_OPENSSL_armcap_P
72#else
73	adrp	x16,_OPENSSL_armcap_P@PAGE
74#endif
75	ldr	w16,[x16,_OPENSSL_armcap_P@PAGEOFF]
76	tst	w16,#ARMV8_SHA512
	// Bit set -> leave for Lv8_entry before any frame has been created.
77	b.ne	Lv8_entry
78#endif
79	AARCH64_SIGN_LINK_REGISTER
80	stp	x29,x30,[sp,#-128]!
81	add	x29,sp,#0
82
	// Save x19..x28; all of them are overwritten by the round code.
83	stp	x19,x20,[sp,#16]
84	stp	x21,x22,[sp,#32]
85	stp	x23,x24,[sp,#48]
86	stp	x25,x26,[sp,#64]
87	stp	x27,x28,[sp,#80]
	// Four 8-byte scratch slots below the frame ([sp,#0]..[sp,#24]).
88	sub	sp,sp,#4*8
89
	// The eight state words are held in x20..x27 from here on.
90	ldp	x20,x21,[x0]				// load context
91	ldp	x22,x23,[x0,#2*8]
92	ldp	x24,x25,[x0,#4*8]
93	add	x2,x1,x2,lsl#7	// end of input
94	ldp	x26,x27,[x0,#6*8]
	// x30 was saved above and is reused as the pointer into LK512.
95	adrp	x30,LK512@PAGE
96	add	x30,x30,LK512@PAGEOFF
	// x0 and x2 are overwritten by message words later, so keep copies in
	// the frame for the per-block epilogue.
97	stp	x0,x2,[x29,#96]
98
// Loop -- start of the per-block loop: rounds 0..15 for one 128-byte block.
//
// On entry: x1 = current input pointer, x30 = pointer into LK512,
// x20..x27 = the eight working words, taken as a..h in that order for round 0.
//
// The code is fully unrolled. Instead of moving values between a..h after
// each round, every round uses the register set shifted by one (round 0
// updates x27 as "h" and x23 as "d", round 1 updates x26 and x22, and so on).
//
// Message words are loaded two at a time from [x1] with post-increment and
// byte-swapped with `rev` unless __AARCH64EB__ is defined. They end up in
//   X[0]..X[12] -> x3..x15,  X[13] -> x0,  X[14] -> x1,  X[15] -> x2
// (the trailing number in each `rev` comment is the word index).
//
// x19 and x28 alternate between holding the constant just fetched from the
// table and the a^b / Maj(a,b,c) intermediate. x16 and x17 are scratch for
// Sigma1/Ch/Sigma0. The final "h += Sigma0(a)" of each round is issued in the
// next round's code (the commented-out `//add` marks where it logically
// belongs).
//
// From round 11 on, x6..x10 are also needed as temporaries, so the message
// words they hold are parked in the four scratch slots at [sp,#0]..[sp,#24].
99Loop:
100	ldp	x3,x4,[x1],#2*8
101	ldr	x19,[x30],#8			// *K++
102	eor	x28,x21,x22				// magic seed
	// x1 is overwritten by message word 14 further down; save the
	// (already advanced by 16) input pointer for the block epilogue.
103	str	x1,[x29,#112]
104#ifndef	__AARCH64EB__
105	rev	x3,x3			// 0
106#endif
107	ror	x16,x24,#14
108	add	x27,x27,x19			// h+=K[i]
109	eor	x6,x24,x24,ror#23
110	and	x17,x25,x24
111	bic	x19,x26,x24
112	add	x27,x27,x3			// h+=X[i]
113	orr	x17,x17,x19			// Ch(e,f,g)
114	eor	x19,x20,x21			// a^b, b^c in next round
115	eor	x16,x16,x6,ror#18	// Sigma1(e)
116	ror	x6,x20,#28
117	add	x27,x27,x17			// h+=Ch(e,f,g)
118	eor	x17,x20,x20,ror#5
119	add	x27,x27,x16			// h+=Sigma1(e)
120	and	x28,x28,x19			// (b^c)&=(a^b)
121	add	x23,x23,x27			// d+=h
122	eor	x28,x28,x21			// Maj(a,b,c)
123	eor	x17,x6,x17,ror#34	// Sigma0(a)
124	add	x27,x27,x28			// h+=Maj(a,b,c)
125	ldr	x28,[x30],#8		// *K++, x19 in next round
126	//add	x27,x27,x17			// h+=Sigma0(a)
127#ifndef	__AARCH64EB__
128	rev	x4,x4			// 1
129#endif
130	ldp	x5,x6,[x1],#2*8
131	add	x27,x27,x17			// h+=Sigma0(a)
132	ror	x16,x23,#14
133	add	x26,x26,x28			// h+=K[i]
134	eor	x7,x23,x23,ror#23
135	and	x17,x24,x23
136	bic	x28,x25,x23
137	add	x26,x26,x4			// h+=X[i]
138	orr	x17,x17,x28			// Ch(e,f,g)
139	eor	x28,x27,x20			// a^b, b^c in next round
140	eor	x16,x16,x7,ror#18	// Sigma1(e)
141	ror	x7,x27,#28
142	add	x26,x26,x17			// h+=Ch(e,f,g)
143	eor	x17,x27,x27,ror#5
144	add	x26,x26,x16			// h+=Sigma1(e)
145	and	x19,x19,x28			// (b^c)&=(a^b)
146	add	x22,x22,x26			// d+=h
147	eor	x19,x19,x20			// Maj(a,b,c)
148	eor	x17,x7,x17,ror#34	// Sigma0(a)
149	add	x26,x26,x19			// h+=Maj(a,b,c)
150	ldr	x19,[x30],#8		// *K++, x28 in next round
151	//add	x26,x26,x17			// h+=Sigma0(a)
152#ifndef	__AARCH64EB__
153	rev	x5,x5			// 2
154#endif
155	add	x26,x26,x17			// h+=Sigma0(a)
156	ror	x16,x22,#14
157	add	x25,x25,x19			// h+=K[i]
158	eor	x8,x22,x22,ror#23
159	and	x17,x23,x22
160	bic	x19,x24,x22
161	add	x25,x25,x5			// h+=X[i]
162	orr	x17,x17,x19			// Ch(e,f,g)
163	eor	x19,x26,x27			// a^b, b^c in next round
164	eor	x16,x16,x8,ror#18	// Sigma1(e)
165	ror	x8,x26,#28
166	add	x25,x25,x17			// h+=Ch(e,f,g)
167	eor	x17,x26,x26,ror#5
168	add	x25,x25,x16			// h+=Sigma1(e)
169	and	x28,x28,x19			// (b^c)&=(a^b)
170	add	x21,x21,x25			// d+=h
171	eor	x28,x28,x27			// Maj(a,b,c)
172	eor	x17,x8,x17,ror#34	// Sigma0(a)
173	add	x25,x25,x28			// h+=Maj(a,b,c)
174	ldr	x28,[x30],#8		// *K++, x19 in next round
175	//add	x25,x25,x17			// h+=Sigma0(a)
176#ifndef	__AARCH64EB__
177	rev	x6,x6			// 3
178#endif
179	ldp	x7,x8,[x1],#2*8
180	add	x25,x25,x17			// h+=Sigma0(a)
181	ror	x16,x21,#14
182	add	x24,x24,x28			// h+=K[i]
183	eor	x9,x21,x21,ror#23
184	and	x17,x22,x21
185	bic	x28,x23,x21
186	add	x24,x24,x6			// h+=X[i]
187	orr	x17,x17,x28			// Ch(e,f,g)
188	eor	x28,x25,x26			// a^b, b^c in next round
189	eor	x16,x16,x9,ror#18	// Sigma1(e)
190	ror	x9,x25,#28
191	add	x24,x24,x17			// h+=Ch(e,f,g)
192	eor	x17,x25,x25,ror#5
193	add	x24,x24,x16			// h+=Sigma1(e)
194	and	x19,x19,x28			// (b^c)&=(a^b)
195	add	x20,x20,x24			// d+=h
196	eor	x19,x19,x26			// Maj(a,b,c)
197	eor	x17,x9,x17,ror#34	// Sigma0(a)
198	add	x24,x24,x19			// h+=Maj(a,b,c)
199	ldr	x19,[x30],#8		// *K++, x28 in next round
200	//add	x24,x24,x17			// h+=Sigma0(a)
201#ifndef	__AARCH64EB__
202	rev	x7,x7			// 4
203#endif
204	add	x24,x24,x17			// h+=Sigma0(a)
205	ror	x16,x20,#14
206	add	x23,x23,x19			// h+=K[i]
207	eor	x10,x20,x20,ror#23
208	and	x17,x21,x20
209	bic	x19,x22,x20
210	add	x23,x23,x7			// h+=X[i]
211	orr	x17,x17,x19			// Ch(e,f,g)
212	eor	x19,x24,x25			// a^b, b^c in next round
213	eor	x16,x16,x10,ror#18	// Sigma1(e)
214	ror	x10,x24,#28
215	add	x23,x23,x17			// h+=Ch(e,f,g)
216	eor	x17,x24,x24,ror#5
217	add	x23,x23,x16			// h+=Sigma1(e)
218	and	x28,x28,x19			// (b^c)&=(a^b)
219	add	x27,x27,x23			// d+=h
220	eor	x28,x28,x25			// Maj(a,b,c)
221	eor	x17,x10,x17,ror#34	// Sigma0(a)
222	add	x23,x23,x28			// h+=Maj(a,b,c)
223	ldr	x28,[x30],#8		// *K++, x19 in next round
224	//add	x23,x23,x17			// h+=Sigma0(a)
225#ifndef	__AARCH64EB__
226	rev	x8,x8			// 5
227#endif
228	ldp	x9,x10,[x1],#2*8
229	add	x23,x23,x17			// h+=Sigma0(a)
230	ror	x16,x27,#14
231	add	x22,x22,x28			// h+=K[i]
232	eor	x11,x27,x27,ror#23
233	and	x17,x20,x27
234	bic	x28,x21,x27
235	add	x22,x22,x8			// h+=X[i]
236	orr	x17,x17,x28			// Ch(e,f,g)
237	eor	x28,x23,x24			// a^b, b^c in next round
238	eor	x16,x16,x11,ror#18	// Sigma1(e)
239	ror	x11,x23,#28
240	add	x22,x22,x17			// h+=Ch(e,f,g)
241	eor	x17,x23,x23,ror#5
242	add	x22,x22,x16			// h+=Sigma1(e)
243	and	x19,x19,x28			// (b^c)&=(a^b)
244	add	x26,x26,x22			// d+=h
245	eor	x19,x19,x24			// Maj(a,b,c)
246	eor	x17,x11,x17,ror#34	// Sigma0(a)
247	add	x22,x22,x19			// h+=Maj(a,b,c)
248	ldr	x19,[x30],#8		// *K++, x28 in next round
249	//add	x22,x22,x17			// h+=Sigma0(a)
250#ifndef	__AARCH64EB__
251	rev	x9,x9			// 6
252#endif
253	add	x22,x22,x17			// h+=Sigma0(a)
254	ror	x16,x26,#14
255	add	x21,x21,x19			// h+=K[i]
256	eor	x12,x26,x26,ror#23
257	and	x17,x27,x26
258	bic	x19,x20,x26
259	add	x21,x21,x9			// h+=X[i]
260	orr	x17,x17,x19			// Ch(e,f,g)
261	eor	x19,x22,x23			// a^b, b^c in next round
262	eor	x16,x16,x12,ror#18	// Sigma1(e)
263	ror	x12,x22,#28
264	add	x21,x21,x17			// h+=Ch(e,f,g)
265	eor	x17,x22,x22,ror#5
266	add	x21,x21,x16			// h+=Sigma1(e)
267	and	x28,x28,x19			// (b^c)&=(a^b)
268	add	x25,x25,x21			// d+=h
269	eor	x28,x28,x23			// Maj(a,b,c)
270	eor	x17,x12,x17,ror#34	// Sigma0(a)
271	add	x21,x21,x28			// h+=Maj(a,b,c)
272	ldr	x28,[x30],#8		// *K++, x19 in next round
273	//add	x21,x21,x17			// h+=Sigma0(a)
274#ifndef	__AARCH64EB__
275	rev	x10,x10			// 7
276#endif
277	ldp	x11,x12,[x1],#2*8
278	add	x21,x21,x17			// h+=Sigma0(a)
279	ror	x16,x25,#14
280	add	x20,x20,x28			// h+=K[i]
281	eor	x13,x25,x25,ror#23
282	and	x17,x26,x25
283	bic	x28,x27,x25
284	add	x20,x20,x10			// h+=X[i]
285	orr	x17,x17,x28			// Ch(e,f,g)
286	eor	x28,x21,x22			// a^b, b^c in next round
287	eor	x16,x16,x13,ror#18	// Sigma1(e)
288	ror	x13,x21,#28
289	add	x20,x20,x17			// h+=Ch(e,f,g)
290	eor	x17,x21,x21,ror#5
291	add	x20,x20,x16			// h+=Sigma1(e)
292	and	x19,x19,x28			// (b^c)&=(a^b)
293	add	x24,x24,x20			// d+=h
294	eor	x19,x19,x22			// Maj(a,b,c)
295	eor	x17,x13,x17,ror#34	// Sigma0(a)
296	add	x20,x20,x19			// h+=Maj(a,b,c)
297	ldr	x19,[x30],#8		// *K++, x28 in next round
298	//add	x20,x20,x17			// h+=Sigma0(a)
299#ifndef	__AARCH64EB__
300	rev	x11,x11			// 8
301#endif
302	add	x20,x20,x17			// h+=Sigma0(a)
303	ror	x16,x24,#14
304	add	x27,x27,x19			// h+=K[i]
305	eor	x14,x24,x24,ror#23
306	and	x17,x25,x24
307	bic	x19,x26,x24
308	add	x27,x27,x11			// h+=X[i]
309	orr	x17,x17,x19			// Ch(e,f,g)
310	eor	x19,x20,x21			// a^b, b^c in next round
311	eor	x16,x16,x14,ror#18	// Sigma1(e)
312	ror	x14,x20,#28
313	add	x27,x27,x17			// h+=Ch(e,f,g)
314	eor	x17,x20,x20,ror#5
315	add	x27,x27,x16			// h+=Sigma1(e)
316	and	x28,x28,x19			// (b^c)&=(a^b)
317	add	x23,x23,x27			// d+=h
318	eor	x28,x28,x21			// Maj(a,b,c)
319	eor	x17,x14,x17,ror#34	// Sigma0(a)
320	add	x27,x27,x28			// h+=Maj(a,b,c)
321	ldr	x28,[x30],#8		// *K++, x19 in next round
322	//add	x27,x27,x17			// h+=Sigma0(a)
323#ifndef	__AARCH64EB__
324	rev	x12,x12			// 9
325#endif
326	ldp	x13,x14,[x1],#2*8
327	add	x27,x27,x17			// h+=Sigma0(a)
328	ror	x16,x23,#14
329	add	x26,x26,x28			// h+=K[i]
330	eor	x15,x23,x23,ror#23
331	and	x17,x24,x23
332	bic	x28,x25,x23
333	add	x26,x26,x12			// h+=X[i]
334	orr	x17,x17,x28			// Ch(e,f,g)
335	eor	x28,x27,x20			// a^b, b^c in next round
336	eor	x16,x16,x15,ror#18	// Sigma1(e)
337	ror	x15,x27,#28
338	add	x26,x26,x17			// h+=Ch(e,f,g)
339	eor	x17,x27,x27,ror#5
340	add	x26,x26,x16			// h+=Sigma1(e)
341	and	x19,x19,x28			// (b^c)&=(a^b)
342	add	x22,x22,x26			// d+=h
343	eor	x19,x19,x20			// Maj(a,b,c)
344	eor	x17,x15,x17,ror#34	// Sigma0(a)
345	add	x26,x26,x19			// h+=Maj(a,b,c)
346	ldr	x19,[x30],#8		// *K++, x28 in next round
347	//add	x26,x26,x17			// h+=Sigma0(a)
348#ifndef	__AARCH64EB__
349	rev	x13,x13			// 10
350#endif
351	add	x26,x26,x17			// h+=Sigma0(a)
352	ror	x16,x22,#14
353	add	x25,x25,x19			// h+=K[i]
	// x0 is free to use as a temporary: the context pointer was saved at
	// [x29,#96] before the loop.
354	eor	x0,x22,x22,ror#23
355	and	x17,x23,x22
356	bic	x19,x24,x22
357	add	x25,x25,x13			// h+=X[i]
358	orr	x17,x17,x19			// Ch(e,f,g)
359	eor	x19,x26,x27			// a^b, b^c in next round
360	eor	x16,x16,x0,ror#18	// Sigma1(e)
361	ror	x0,x26,#28
362	add	x25,x25,x17			// h+=Ch(e,f,g)
363	eor	x17,x26,x26,ror#5
364	add	x25,x25,x16			// h+=Sigma1(e)
365	and	x28,x28,x19			// (b^c)&=(a^b)
366	add	x21,x21,x25			// d+=h
367	eor	x28,x28,x27			// Maj(a,b,c)
368	eor	x17,x0,x17,ror#34	// Sigma0(a)
369	add	x25,x25,x28			// h+=Maj(a,b,c)
370	ldr	x28,[x30],#8		// *K++, x19 in next round
371	//add	x25,x25,x17			// h+=Sigma0(a)
372#ifndef	__AARCH64EB__
373	rev	x14,x14			// 11
374#endif
375	ldp	x15,x0,[x1],#2*8
376	add	x25,x25,x17			// h+=Sigma0(a)
	// Park message word 3 (x6): x6 is reused as a temporary just below.
377	str	x6,[sp,#24]
378	ror	x16,x21,#14
379	add	x24,x24,x28			// h+=K[i]
380	eor	x6,x21,x21,ror#23
381	and	x17,x22,x21
382	bic	x28,x23,x21
383	add	x24,x24,x14			// h+=X[i]
384	orr	x17,x17,x28			// Ch(e,f,g)
385	eor	x28,x25,x26			// a^b, b^c in next round
386	eor	x16,x16,x6,ror#18	// Sigma1(e)
387	ror	x6,x25,#28
388	add	x24,x24,x17			// h+=Ch(e,f,g)
389	eor	x17,x25,x25,ror#5
390	add	x24,x24,x16			// h+=Sigma1(e)
391	and	x19,x19,x28			// (b^c)&=(a^b)
392	add	x20,x20,x24			// d+=h
393	eor	x19,x19,x26			// Maj(a,b,c)
394	eor	x17,x6,x17,ror#34	// Sigma0(a)
395	add	x24,x24,x19			// h+=Maj(a,b,c)
396	ldr	x19,[x30],#8		// *K++, x28 in next round
397	//add	x24,x24,x17			// h+=Sigma0(a)
398#ifndef	__AARCH64EB__
399	rev	x15,x15			// 12
400#endif
401	add	x24,x24,x17			// h+=Sigma0(a)
	// Park message word 4 (x7); x7 becomes a temporary.
402	str	x7,[sp,#0]
403	ror	x16,x20,#14
404	add	x23,x23,x19			// h+=K[i]
405	eor	x7,x20,x20,ror#23
406	and	x17,x21,x20
407	bic	x19,x22,x20
408	add	x23,x23,x15			// h+=X[i]
409	orr	x17,x17,x19			// Ch(e,f,g)
410	eor	x19,x24,x25			// a^b, b^c in next round
411	eor	x16,x16,x7,ror#18	// Sigma1(e)
412	ror	x7,x24,#28
413	add	x23,x23,x17			// h+=Ch(e,f,g)
414	eor	x17,x24,x24,ror#5
415	add	x23,x23,x16			// h+=Sigma1(e)
416	and	x28,x28,x19			// (b^c)&=(a^b)
417	add	x27,x27,x23			// d+=h
418	eor	x28,x28,x25			// Maj(a,b,c)
419	eor	x17,x7,x17,ror#34	// Sigma0(a)
420	add	x23,x23,x28			// h+=Maj(a,b,c)
421	ldr	x28,[x30],#8		// *K++, x19 in next round
422	//add	x23,x23,x17			// h+=Sigma0(a)
423#ifndef	__AARCH64EB__
424	rev	x0,x0			// 13
425#endif
	// The last two message words replace the input pointer (x1) and the
	// end-of-input pointer (x2); both have copies in the frame at
	// [x29,#112] and [x29,#104].
426	ldp	x1,x2,[x1]
427	add	x23,x23,x17			// h+=Sigma0(a)
	// Park message word 5 (x8); x8 becomes a temporary.
428	str	x8,[sp,#8]
429	ror	x16,x27,#14
430	add	x22,x22,x28			// h+=K[i]
431	eor	x8,x27,x27,ror#23
432	and	x17,x20,x27
433	bic	x28,x21,x27
434	add	x22,x22,x0			// h+=X[i]
435	orr	x17,x17,x28			// Ch(e,f,g)
436	eor	x28,x23,x24			// a^b, b^c in next round
437	eor	x16,x16,x8,ror#18	// Sigma1(e)
438	ror	x8,x23,#28
439	add	x22,x22,x17			// h+=Ch(e,f,g)
440	eor	x17,x23,x23,ror#5
441	add	x22,x22,x16			// h+=Sigma1(e)
442	and	x19,x19,x28			// (b^c)&=(a^b)
443	add	x26,x26,x22			// d+=h
444	eor	x19,x19,x24			// Maj(a,b,c)
445	eor	x17,x8,x17,ror#34	// Sigma0(a)
446	add	x22,x22,x19			// h+=Maj(a,b,c)
447	ldr	x19,[x30],#8		// *K++, x28 in next round
448	//add	x22,x22,x17			// h+=Sigma0(a)
449#ifndef	__AARCH64EB__
450	rev	x1,x1			// 14
451#endif
	// Bring message word 3 back into x6, then park word 6 (x9).
452	ldr	x6,[sp,#24]
453	add	x22,x22,x17			// h+=Sigma0(a)
454	str	x9,[sp,#16]
455	ror	x16,x26,#14
456	add	x21,x21,x19			// h+=K[i]
457	eor	x9,x26,x26,ror#23
458	and	x17,x27,x26
459	bic	x19,x20,x26
460	add	x21,x21,x1			// h+=X[i]
461	orr	x17,x17,x19			// Ch(e,f,g)
462	eor	x19,x22,x23			// a^b, b^c in next round
463	eor	x16,x16,x9,ror#18	// Sigma1(e)
464	ror	x9,x22,#28
465	add	x21,x21,x17			// h+=Ch(e,f,g)
466	eor	x17,x22,x22,ror#5
467	add	x21,x21,x16			// h+=Sigma1(e)
468	and	x28,x28,x19			// (b^c)&=(a^b)
469	add	x25,x25,x21			// d+=h
470	eor	x28,x28,x23			// Maj(a,b,c)
471	eor	x17,x9,x17,ror#34	// Sigma0(a)
472	add	x21,x21,x28			// h+=Maj(a,b,c)
473	ldr	x28,[x30],#8		// *K++, x19 in next round
474	//add	x21,x21,x17			// h+=Sigma0(a)
475#ifndef	__AARCH64EB__
476	rev	x2,x2			// 15
477#endif
	// Round 15 is interleaved with the first message-schedule update:
	// x3 (word 0) becomes x3 + x12 + sigma0(x4) + sigma1(x1), which is the
	// word consumed by the first round of Loop_16_xx.
478	ldr	x7,[sp,#0]
479	add	x21,x21,x17			// h+=Sigma0(a)
480	str	x10,[sp,#24]
481	ror	x16,x25,#14
482	add	x20,x20,x28			// h+=K[i]
483	ror	x9,x4,#1
484	and	x17,x26,x25
485	ror	x8,x1,#19
486	bic	x28,x27,x25
487	ror	x10,x21,#28
488	add	x20,x20,x2			// h+=X[i]
489	eor	x16,x16,x25,ror#18
490	eor	x9,x9,x4,ror#8
491	orr	x17,x17,x28			// Ch(e,f,g)
492	eor	x28,x21,x22			// a^b, b^c in next round
493	eor	x16,x16,x25,ror#41	// Sigma1(e)
494	eor	x10,x10,x21,ror#34
495	add	x20,x20,x17			// h+=Ch(e,f,g)
496	and	x19,x19,x28			// (b^c)&=(a^b)
497	eor	x8,x8,x1,ror#61
498	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
499	add	x20,x20,x16			// h+=Sigma1(e)
500	eor	x19,x19,x22			// Maj(a,b,c)
501	eor	x17,x10,x21,ror#39	// Sigma0(a)
502	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
503	add	x3,x3,x12
504	add	x24,x24,x20			// d+=h
505	add	x20,x20,x19			// h+=Maj(a,b,c)
506	ldr	x19,[x30],#8		// *K++, x28 in next round
507	add	x3,x3,x9
508	add	x20,x20,x17			// h+=Sigma0(a)
509	add	x3,x3,x8
// Loop_16_xx -- rounds 16 and up, sixteen rounds per iteration, followed by
// the state write-back, the outer-loop test and the function epilogue.
//
// Each round in the body does two things at once:
//   * a compression round on the working words held in x20..x27 (same
//     rotating register roles as in rounds 0..15), using the message word
//     that the previous round finished computing, and
//   * the message-schedule update for the following round:
//       X += X[+9] + sigma0(X[+1]) + sigma1(X[+14])
//     written in place over the oldest message word.
//
// The sixteen message words live in x3..x15, x0, x1, x2. Because the round
// code also borrows some of those registers as temporaries, four words at a
// time are kept in the scratch slots [sp,#0]..[sp,#24]; every round begins
// with one `ldr` (restore a word that is about to be needed) and one `str`
// (park a word whose register is about to be borrowed).
//
// The loop runs until the value just fetched from the constant table into
// x19 is zero, i.e. until the terminator that follows the constants in LK512
// has been read.
510Loop_16_xx:
511	ldr	x8,[sp,#8]
512	str	x11,[sp,#0]
513	ror	x16,x24,#14
514	add	x27,x27,x19			// h+=K[i]
515	ror	x10,x5,#1
516	and	x17,x25,x24
517	ror	x9,x2,#19
518	bic	x19,x26,x24
519	ror	x11,x20,#28
520	add	x27,x27,x3			// h+=X[i]
521	eor	x16,x16,x24,ror#18
522	eor	x10,x10,x5,ror#8
523	orr	x17,x17,x19			// Ch(e,f,g)
524	eor	x19,x20,x21			// a^b, b^c in next round
525	eor	x16,x16,x24,ror#41	// Sigma1(e)
526	eor	x11,x11,x20,ror#34
527	add	x27,x27,x17			// h+=Ch(e,f,g)
528	and	x28,x28,x19			// (b^c)&=(a^b)
529	eor	x9,x9,x2,ror#61
530	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
531	add	x27,x27,x16			// h+=Sigma1(e)
532	eor	x28,x28,x21			// Maj(a,b,c)
533	eor	x17,x11,x20,ror#39	// Sigma0(a)
534	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
535	add	x4,x4,x13
536	add	x23,x23,x27			// d+=h
537	add	x27,x27,x28			// h+=Maj(a,b,c)
538	ldr	x28,[x30],#8		// *K++, x19 in next round
539	add	x4,x4,x10
540	add	x27,x27,x17			// h+=Sigma0(a)
541	add	x4,x4,x9
542	ldr	x9,[sp,#16]
543	str	x12,[sp,#8]
544	ror	x16,x23,#14
545	add	x26,x26,x28			// h+=K[i]
546	ror	x11,x6,#1
547	and	x17,x24,x23
548	ror	x10,x3,#19
549	bic	x28,x25,x23
550	ror	x12,x27,#28
551	add	x26,x26,x4			// h+=X[i]
552	eor	x16,x16,x23,ror#18
553	eor	x11,x11,x6,ror#8
554	orr	x17,x17,x28			// Ch(e,f,g)
555	eor	x28,x27,x20			// a^b, b^c in next round
556	eor	x16,x16,x23,ror#41	// Sigma1(e)
557	eor	x12,x12,x27,ror#34
558	add	x26,x26,x17			// h+=Ch(e,f,g)
559	and	x19,x19,x28			// (b^c)&=(a^b)
560	eor	x10,x10,x3,ror#61
561	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
562	add	x26,x26,x16			// h+=Sigma1(e)
563	eor	x19,x19,x20			// Maj(a,b,c)
564	eor	x17,x12,x27,ror#39	// Sigma0(a)
565	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
566	add	x5,x5,x14
567	add	x22,x22,x26			// d+=h
568	add	x26,x26,x19			// h+=Maj(a,b,c)
569	ldr	x19,[x30],#8		// *K++, x28 in next round
570	add	x5,x5,x11
571	add	x26,x26,x17			// h+=Sigma0(a)
572	add	x5,x5,x10
573	ldr	x10,[sp,#24]
574	str	x13,[sp,#16]
575	ror	x16,x22,#14
576	add	x25,x25,x19			// h+=K[i]
577	ror	x12,x7,#1
578	and	x17,x23,x22
579	ror	x11,x4,#19
580	bic	x19,x24,x22
581	ror	x13,x26,#28
582	add	x25,x25,x5			// h+=X[i]
583	eor	x16,x16,x22,ror#18
584	eor	x12,x12,x7,ror#8
585	orr	x17,x17,x19			// Ch(e,f,g)
586	eor	x19,x26,x27			// a^b, b^c in next round
587	eor	x16,x16,x22,ror#41	// Sigma1(e)
588	eor	x13,x13,x26,ror#34
589	add	x25,x25,x17			// h+=Ch(e,f,g)
590	and	x28,x28,x19			// (b^c)&=(a^b)
591	eor	x11,x11,x4,ror#61
592	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
593	add	x25,x25,x16			// h+=Sigma1(e)
594	eor	x28,x28,x27			// Maj(a,b,c)
595	eor	x17,x13,x26,ror#39	// Sigma0(a)
596	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
597	add	x6,x6,x15
598	add	x21,x21,x25			// d+=h
599	add	x25,x25,x28			// h+=Maj(a,b,c)
600	ldr	x28,[x30],#8		// *K++, x19 in next round
601	add	x6,x6,x12
602	add	x25,x25,x17			// h+=Sigma0(a)
603	add	x6,x6,x11
604	ldr	x11,[sp,#0]
605	str	x14,[sp,#24]
606	ror	x16,x21,#14
607	add	x24,x24,x28			// h+=K[i]
608	ror	x13,x8,#1
609	and	x17,x22,x21
610	ror	x12,x5,#19
611	bic	x28,x23,x21
612	ror	x14,x25,#28
613	add	x24,x24,x6			// h+=X[i]
614	eor	x16,x16,x21,ror#18
615	eor	x13,x13,x8,ror#8
616	orr	x17,x17,x28			// Ch(e,f,g)
617	eor	x28,x25,x26			// a^b, b^c in next round
618	eor	x16,x16,x21,ror#41	// Sigma1(e)
619	eor	x14,x14,x25,ror#34
620	add	x24,x24,x17			// h+=Ch(e,f,g)
621	and	x19,x19,x28			// (b^c)&=(a^b)
622	eor	x12,x12,x5,ror#61
623	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
624	add	x24,x24,x16			// h+=Sigma1(e)
625	eor	x19,x19,x26			// Maj(a,b,c)
626	eor	x17,x14,x25,ror#39	// Sigma0(a)
627	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
628	add	x7,x7,x0
629	add	x20,x20,x24			// d+=h
630	add	x24,x24,x19			// h+=Maj(a,b,c)
631	ldr	x19,[x30],#8		// *K++, x28 in next round
632	add	x7,x7,x13
633	add	x24,x24,x17			// h+=Sigma0(a)
634	add	x7,x7,x12
635	ldr	x12,[sp,#8]
636	str	x15,[sp,#0]
637	ror	x16,x20,#14
638	add	x23,x23,x19			// h+=K[i]
639	ror	x14,x9,#1
640	and	x17,x21,x20
641	ror	x13,x6,#19
642	bic	x19,x22,x20
643	ror	x15,x24,#28
644	add	x23,x23,x7			// h+=X[i]
645	eor	x16,x16,x20,ror#18
646	eor	x14,x14,x9,ror#8
647	orr	x17,x17,x19			// Ch(e,f,g)
648	eor	x19,x24,x25			// a^b, b^c in next round
649	eor	x16,x16,x20,ror#41	// Sigma1(e)
650	eor	x15,x15,x24,ror#34
651	add	x23,x23,x17			// h+=Ch(e,f,g)
652	and	x28,x28,x19			// (b^c)&=(a^b)
653	eor	x13,x13,x6,ror#61
654	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
655	add	x23,x23,x16			// h+=Sigma1(e)
656	eor	x28,x28,x25			// Maj(a,b,c)
657	eor	x17,x15,x24,ror#39	// Sigma0(a)
658	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
659	add	x8,x8,x1
660	add	x27,x27,x23			// d+=h
661	add	x23,x23,x28			// h+=Maj(a,b,c)
662	ldr	x28,[x30],#8		// *K++, x19 in next round
663	add	x8,x8,x14
664	add	x23,x23,x17			// h+=Sigma0(a)
665	add	x8,x8,x13
666	ldr	x13,[sp,#16]
667	str	x0,[sp,#8]
668	ror	x16,x27,#14
669	add	x22,x22,x28			// h+=K[i]
670	ror	x15,x10,#1
671	and	x17,x20,x27
672	ror	x14,x7,#19
673	bic	x28,x21,x27
674	ror	x0,x23,#28
675	add	x22,x22,x8			// h+=X[i]
676	eor	x16,x16,x27,ror#18
677	eor	x15,x15,x10,ror#8
678	orr	x17,x17,x28			// Ch(e,f,g)
679	eor	x28,x23,x24			// a^b, b^c in next round
680	eor	x16,x16,x27,ror#41	// Sigma1(e)
681	eor	x0,x0,x23,ror#34
682	add	x22,x22,x17			// h+=Ch(e,f,g)
683	and	x19,x19,x28			// (b^c)&=(a^b)
684	eor	x14,x14,x7,ror#61
685	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
686	add	x22,x22,x16			// h+=Sigma1(e)
687	eor	x19,x19,x24			// Maj(a,b,c)
688	eor	x17,x0,x23,ror#39	// Sigma0(a)
689	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
690	add	x9,x9,x2
691	add	x26,x26,x22			// d+=h
692	add	x22,x22,x19			// h+=Maj(a,b,c)
693	ldr	x19,[x30],#8		// *K++, x28 in next round
694	add	x9,x9,x15
695	add	x22,x22,x17			// h+=Sigma0(a)
696	add	x9,x9,x14
697	ldr	x14,[sp,#24]
698	str	x1,[sp,#16]
699	ror	x16,x26,#14
700	add	x21,x21,x19			// h+=K[i]
701	ror	x0,x11,#1
702	and	x17,x27,x26
703	ror	x15,x8,#19
704	bic	x19,x20,x26
705	ror	x1,x22,#28
706	add	x21,x21,x9			// h+=X[i]
707	eor	x16,x16,x26,ror#18
708	eor	x0,x0,x11,ror#8
709	orr	x17,x17,x19			// Ch(e,f,g)
710	eor	x19,x22,x23			// a^b, b^c in next round
711	eor	x16,x16,x26,ror#41	// Sigma1(e)
712	eor	x1,x1,x22,ror#34
713	add	x21,x21,x17			// h+=Ch(e,f,g)
714	and	x28,x28,x19			// (b^c)&=(a^b)
715	eor	x15,x15,x8,ror#61
716	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
717	add	x21,x21,x16			// h+=Sigma1(e)
718	eor	x28,x28,x23			// Maj(a,b,c)
719	eor	x17,x1,x22,ror#39	// Sigma0(a)
720	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
721	add	x10,x10,x3
722	add	x25,x25,x21			// d+=h
723	add	x21,x21,x28			// h+=Maj(a,b,c)
724	ldr	x28,[x30],#8		// *K++, x19 in next round
725	add	x10,x10,x0
726	add	x21,x21,x17			// h+=Sigma0(a)
727	add	x10,x10,x15
728	ldr	x15,[sp,#0]
729	str	x2,[sp,#24]
730	ror	x16,x25,#14
731	add	x20,x20,x28			// h+=K[i]
732	ror	x1,x12,#1
733	and	x17,x26,x25
734	ror	x0,x9,#19
735	bic	x28,x27,x25
736	ror	x2,x21,#28
737	add	x20,x20,x10			// h+=X[i]
738	eor	x16,x16,x25,ror#18
739	eor	x1,x1,x12,ror#8
740	orr	x17,x17,x28			// Ch(e,f,g)
741	eor	x28,x21,x22			// a^b, b^c in next round
742	eor	x16,x16,x25,ror#41	// Sigma1(e)
743	eor	x2,x2,x21,ror#34
744	add	x20,x20,x17			// h+=Ch(e,f,g)
745	and	x19,x19,x28			// (b^c)&=(a^b)
746	eor	x0,x0,x9,ror#61
747	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
748	add	x20,x20,x16			// h+=Sigma1(e)
749	eor	x19,x19,x22			// Maj(a,b,c)
750	eor	x17,x2,x21,ror#39	// Sigma0(a)
751	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
752	add	x11,x11,x4
753	add	x24,x24,x20			// d+=h
754	add	x20,x20,x19			// h+=Maj(a,b,c)
755	ldr	x19,[x30],#8		// *K++, x28 in next round
756	add	x11,x11,x1
757	add	x20,x20,x17			// h+=Sigma0(a)
758	add	x11,x11,x0
759	ldr	x0,[sp,#8]
760	str	x3,[sp,#0]
761	ror	x16,x24,#14
762	add	x27,x27,x19			// h+=K[i]
763	ror	x2,x13,#1
764	and	x17,x25,x24
765	ror	x1,x10,#19
766	bic	x19,x26,x24
767	ror	x3,x20,#28
768	add	x27,x27,x11			// h+=X[i]
769	eor	x16,x16,x24,ror#18
770	eor	x2,x2,x13,ror#8
771	orr	x17,x17,x19			// Ch(e,f,g)
772	eor	x19,x20,x21			// a^b, b^c in next round
773	eor	x16,x16,x24,ror#41	// Sigma1(e)
774	eor	x3,x3,x20,ror#34
775	add	x27,x27,x17			// h+=Ch(e,f,g)
776	and	x28,x28,x19			// (b^c)&=(a^b)
777	eor	x1,x1,x10,ror#61
778	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
779	add	x27,x27,x16			// h+=Sigma1(e)
780	eor	x28,x28,x21			// Maj(a,b,c)
781	eor	x17,x3,x20,ror#39	// Sigma0(a)
782	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
783	add	x12,x12,x5
784	add	x23,x23,x27			// d+=h
785	add	x27,x27,x28			// h+=Maj(a,b,c)
786	ldr	x28,[x30],#8		// *K++, x19 in next round
787	add	x12,x12,x2
788	add	x27,x27,x17			// h+=Sigma0(a)
789	add	x12,x12,x1
790	ldr	x1,[sp,#16]
791	str	x4,[sp,#8]
792	ror	x16,x23,#14
793	add	x26,x26,x28			// h+=K[i]
794	ror	x3,x14,#1
795	and	x17,x24,x23
796	ror	x2,x11,#19
797	bic	x28,x25,x23
798	ror	x4,x27,#28
799	add	x26,x26,x12			// h+=X[i]
800	eor	x16,x16,x23,ror#18
801	eor	x3,x3,x14,ror#8
802	orr	x17,x17,x28			// Ch(e,f,g)
803	eor	x28,x27,x20			// a^b, b^c in next round
804	eor	x16,x16,x23,ror#41	// Sigma1(e)
805	eor	x4,x4,x27,ror#34
806	add	x26,x26,x17			// h+=Ch(e,f,g)
807	and	x19,x19,x28			// (b^c)&=(a^b)
808	eor	x2,x2,x11,ror#61
809	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
810	add	x26,x26,x16			// h+=Sigma1(e)
811	eor	x19,x19,x20			// Maj(a,b,c)
812	eor	x17,x4,x27,ror#39	// Sigma0(a)
813	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
814	add	x13,x13,x6
815	add	x22,x22,x26			// d+=h
816	add	x26,x26,x19			// h+=Maj(a,b,c)
817	ldr	x19,[x30],#8		// *K++, x28 in next round
818	add	x13,x13,x3
819	add	x26,x26,x17			// h+=Sigma0(a)
820	add	x13,x13,x2
821	ldr	x2,[sp,#24]
822	str	x5,[sp,#16]
823	ror	x16,x22,#14
824	add	x25,x25,x19			// h+=K[i]
825	ror	x4,x15,#1
826	and	x17,x23,x22
827	ror	x3,x12,#19
828	bic	x19,x24,x22
829	ror	x5,x26,#28
830	add	x25,x25,x13			// h+=X[i]
831	eor	x16,x16,x22,ror#18
832	eor	x4,x4,x15,ror#8
833	orr	x17,x17,x19			// Ch(e,f,g)
834	eor	x19,x26,x27			// a^b, b^c in next round
835	eor	x16,x16,x22,ror#41	// Sigma1(e)
836	eor	x5,x5,x26,ror#34
837	add	x25,x25,x17			// h+=Ch(e,f,g)
838	and	x28,x28,x19			// (b^c)&=(a^b)
839	eor	x3,x3,x12,ror#61
840	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
841	add	x25,x25,x16			// h+=Sigma1(e)
842	eor	x28,x28,x27			// Maj(a,b,c)
843	eor	x17,x5,x26,ror#39	// Sigma0(a)
844	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
845	add	x14,x14,x7
846	add	x21,x21,x25			// d+=h
847	add	x25,x25,x28			// h+=Maj(a,b,c)
848	ldr	x28,[x30],#8		// *K++, x19 in next round
849	add	x14,x14,x4
850	add	x25,x25,x17			// h+=Sigma0(a)
851	add	x14,x14,x3
852	ldr	x3,[sp,#0]
853	str	x6,[sp,#24]
854	ror	x16,x21,#14
855	add	x24,x24,x28			// h+=K[i]
856	ror	x5,x0,#1
857	and	x17,x22,x21
858	ror	x4,x13,#19
859	bic	x28,x23,x21
860	ror	x6,x25,#28
861	add	x24,x24,x14			// h+=X[i]
862	eor	x16,x16,x21,ror#18
863	eor	x5,x5,x0,ror#8
864	orr	x17,x17,x28			// Ch(e,f,g)
865	eor	x28,x25,x26			// a^b, b^c in next round
866	eor	x16,x16,x21,ror#41	// Sigma1(e)
867	eor	x6,x6,x25,ror#34
868	add	x24,x24,x17			// h+=Ch(e,f,g)
869	and	x19,x19,x28			// (b^c)&=(a^b)
870	eor	x4,x4,x13,ror#61
871	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
872	add	x24,x24,x16			// h+=Sigma1(e)
873	eor	x19,x19,x26			// Maj(a,b,c)
874	eor	x17,x6,x25,ror#39	// Sigma0(a)
875	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
876	add	x15,x15,x8
877	add	x20,x20,x24			// d+=h
878	add	x24,x24,x19			// h+=Maj(a,b,c)
879	ldr	x19,[x30],#8		// *K++, x28 in next round
880	add	x15,x15,x5
881	add	x24,x24,x17			// h+=Sigma0(a)
882	add	x15,x15,x4
883	ldr	x4,[sp,#8]
884	str	x7,[sp,#0]
885	ror	x16,x20,#14
886	add	x23,x23,x19			// h+=K[i]
887	ror	x6,x1,#1
888	and	x17,x21,x20
889	ror	x5,x14,#19
890	bic	x19,x22,x20
891	ror	x7,x24,#28
892	add	x23,x23,x15			// h+=X[i]
893	eor	x16,x16,x20,ror#18
894	eor	x6,x6,x1,ror#8
895	orr	x17,x17,x19			// Ch(e,f,g)
896	eor	x19,x24,x25			// a^b, b^c in next round
897	eor	x16,x16,x20,ror#41	// Sigma1(e)
898	eor	x7,x7,x24,ror#34
899	add	x23,x23,x17			// h+=Ch(e,f,g)
900	and	x28,x28,x19			// (b^c)&=(a^b)
901	eor	x5,x5,x14,ror#61
902	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
903	add	x23,x23,x16			// h+=Sigma1(e)
904	eor	x28,x28,x25			// Maj(a,b,c)
905	eor	x17,x7,x24,ror#39	// Sigma0(a)
906	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
907	add	x0,x0,x9
908	add	x27,x27,x23			// d+=h
909	add	x23,x23,x28			// h+=Maj(a,b,c)
910	ldr	x28,[x30],#8		// *K++, x19 in next round
911	add	x0,x0,x6
912	add	x23,x23,x17			// h+=Sigma0(a)
913	add	x0,x0,x5
914	ldr	x5,[sp,#16]
915	str	x8,[sp,#8]
916	ror	x16,x27,#14
917	add	x22,x22,x28			// h+=K[i]
918	ror	x7,x2,#1
919	and	x17,x20,x27
920	ror	x6,x15,#19
921	bic	x28,x21,x27
922	ror	x8,x23,#28
923	add	x22,x22,x0			// h+=X[i]
924	eor	x16,x16,x27,ror#18
925	eor	x7,x7,x2,ror#8
926	orr	x17,x17,x28			// Ch(e,f,g)
927	eor	x28,x23,x24			// a^b, b^c in next round
928	eor	x16,x16,x27,ror#41	// Sigma1(e)
929	eor	x8,x8,x23,ror#34
930	add	x22,x22,x17			// h+=Ch(e,f,g)
931	and	x19,x19,x28			// (b^c)&=(a^b)
932	eor	x6,x6,x15,ror#61
933	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
934	add	x22,x22,x16			// h+=Sigma1(e)
935	eor	x19,x19,x24			// Maj(a,b,c)
936	eor	x17,x8,x23,ror#39	// Sigma0(a)
937	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
938	add	x1,x1,x10
939	add	x26,x26,x22			// d+=h
940	add	x22,x22,x19			// h+=Maj(a,b,c)
941	ldr	x19,[x30],#8		// *K++, x28 in next round
942	add	x1,x1,x7
943	add	x22,x22,x17			// h+=Sigma0(a)
944	add	x1,x1,x6
945	ldr	x6,[sp,#24]
946	str	x9,[sp,#16]
947	ror	x16,x26,#14
948	add	x21,x21,x19			// h+=K[i]
949	ror	x8,x3,#1
950	and	x17,x27,x26
951	ror	x7,x0,#19
952	bic	x19,x20,x26
953	ror	x9,x22,#28
954	add	x21,x21,x1			// h+=X[i]
955	eor	x16,x16,x26,ror#18
956	eor	x8,x8,x3,ror#8
957	orr	x17,x17,x19			// Ch(e,f,g)
958	eor	x19,x22,x23			// a^b, b^c in next round
959	eor	x16,x16,x26,ror#41	// Sigma1(e)
960	eor	x9,x9,x22,ror#34
961	add	x21,x21,x17			// h+=Ch(e,f,g)
962	and	x28,x28,x19			// (b^c)&=(a^b)
963	eor	x7,x7,x0,ror#61
964	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
965	add	x21,x21,x16			// h+=Sigma1(e)
966	eor	x28,x28,x23			// Maj(a,b,c)
967	eor	x17,x9,x22,ror#39	// Sigma0(a)
968	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
969	add	x2,x2,x11
970	add	x25,x25,x21			// d+=h
971	add	x21,x21,x28			// h+=Maj(a,b,c)
972	ldr	x28,[x30],#8		// *K++, x19 in next round
973	add	x2,x2,x8
974	add	x21,x21,x17			// h+=Sigma0(a)
975	add	x2,x2,x7
976	ldr	x7,[sp,#0]
977	str	x10,[sp,#24]
978	ror	x16,x25,#14
979	add	x20,x20,x28			// h+=K[i]
980	ror	x9,x4,#1
981	and	x17,x26,x25
982	ror	x8,x1,#19
983	bic	x28,x27,x25
984	ror	x10,x21,#28
985	add	x20,x20,x2			// h+=X[i]
986	eor	x16,x16,x25,ror#18
987	eor	x9,x9,x4,ror#8
988	orr	x17,x17,x28			// Ch(e,f,g)
989	eor	x28,x21,x22			// a^b, b^c in next round
990	eor	x16,x16,x25,ror#41	// Sigma1(e)
991	eor	x10,x10,x21,ror#34
992	add	x20,x20,x17			// h+=Ch(e,f,g)
993	and	x19,x19,x28			// (b^c)&=(a^b)
994	eor	x8,x8,x1,ror#61
995	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
996	add	x20,x20,x16			// h+=Sigma1(e)
997	eor	x19,x19,x22			// Maj(a,b,c)
998	eor	x17,x10,x21,ror#39	// Sigma0(a)
999	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
1000	add	x3,x3,x12
1001	add	x24,x24,x20			// d+=h
1002	add	x20,x20,x19			// h+=Maj(a,b,c)
1003	ldr	x19,[x30],#8		// *K++, x28 in next round
1004	add	x3,x3,x9
1005	add	x20,x20,x17			// h+=Sigma0(a)
1006	add	x3,x3,x8
	// x19 holds the table entry just fetched; a zero value is the
	// terminator after the last constant, which ends the round loop.
1007	cbnz	x19,Loop_16_xx
1008
	// Block finished. Recover the pointers that were overwritten by message
	// words: x0 = context, x2 = end of input, x1 = saved input pointer.
1009	ldp	x0,x2,[x29,#96]
1010	ldr	x1,[x29,#112]
	// 648 = 81 * 8: the 80 constants plus the terminator were all read with
	// post-increment, so this puts x30 back at the start of LK512.
1011	sub	x30,x30,#648		// rewind
1012
	// Add the working words to the previous state and store the result.
1013	ldp	x3,x4,[x0]
1014	ldp	x5,x6,[x0,#2*8]
	// The saved pointer was taken after the first 16-byte load, so adding
	// 14*8 moves it a full 128 bytes past the start of this block.
1015	add	x1,x1,#14*8			// advance input pointer
1016	ldp	x7,x8,[x0,#4*8]
1017	add	x20,x20,x3
1018	ldp	x9,x10,[x0,#6*8]
1019	add	x21,x21,x4
1020	add	x22,x22,x5
1021	add	x23,x23,x6
1022	stp	x20,x21,[x0]
1023	add	x24,x24,x7
1024	add	x25,x25,x8
1025	stp	x22,x23,[x0,#2*8]
1026	add	x26,x26,x9
1027	add	x27,x27,x10
	// Flags from this compare survive the two stores below (stp does not
	// modify flags) and drive the b.ne that continues with the next block.
1028	cmp	x1,x2
1029	stp	x24,x25,[x0,#4*8]
1030	stp	x26,x27,[x0,#6*8]
1031	b.ne	Loop
1032
	// Epilogue: restore x19..x28 through x29, drop the 32-byte scratch
	// area, pop the 128-byte frame together with x29/x30, and return.
1033	ldp	x19,x20,[x29,#16]
1034	add	sp,sp,#4*8
1035	ldp	x21,x22,[x29,#32]
1036	ldp	x23,x24,[x29,#48]
1037	ldp	x25,x26,[x29,#64]
1038	ldp	x27,x28,[x29,#80]
1039	ldp	x29,x30,[sp],#128
1040	AARCH64_VALIDATE_LINK_REGISTER
1041	ret
1042
1043
// LK512: SHA-512 round-constant table, placed in the Mach-O read-only
// constant section and aligned with .align 6.
//
// Layout: 80 64-bit constants K[0..79], two per .quad line, followed by one
// zero quad (81 quads, 648 bytes in total).
//
// Consumers visible in this file:
//   - Loop_hw (hardware path) takes the address via adrp/add, reads the 80
//     constants 16 bytes at a time and rewinds its cursor by 80*8 bytes, so
//     it never reads the terminator.
//   - The scalar loop fetches one quad per round through x30 and leaves the
//     loop when the fetched quad is zero (cbnz x19), then rewinds by
//     648 = 81*8 bytes. NOTE(review): the setup of x30 is outside this view;
//     presumably it points at this table - confirm.
1044.section	__TEXT,__const
1045.align	6
1046
1047LK512:
1048.quad	0x428a2f98d728ae22,0x7137449123ef65cd
1049.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
1050.quad	0x3956c25bf348b538,0x59f111f1b605d019
1051.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
1052.quad	0xd807aa98a3030242,0x12835b0145706fbe
1053.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
1054.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
1055.quad	0x9bdc06a725c71235,0xc19bf174cf692694
1056.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
1057.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
1058.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
1059.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
1060.quad	0x983e5152ee66dfab,0xa831c66d2db43210
1061.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
1062.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
1063.quad	0x06ca6351e003826f,0x142929670a0e6e70
1064.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
1065.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
1066.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
1067.quad	0x81c2c92e47edaee6,0x92722c851482353b
1068.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
1069.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
1070.quad	0xd192e819d6ef5218,0xd69906245565a910
1071.quad	0xf40e35855771202a,0x106aa07032bbd1b8
1072.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
1073.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
1074.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
1075.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
1076.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
1077.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
1078.quad	0x90befffa23631e28,0xa4506cebde82bde9
1079.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
1080.quad	0xca273eceea26619c,0xd186b8c721c0c207
1081.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
1082.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
1083.quad	0x113f9804bef90dae,0x1b710b35131c471b
1084.quad	0x28db77f523047d84,0x32caab7b40c72493
1085.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
1086.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
1087.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
1088.quad	0	// terminator
1089
// NUL-terminated ASCII identification string:
// "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
1090.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// Re-align after the odd-length string. The second .align 2 repeats the
// first and has no further effect.
1091.align	2
1092.align	2
// Switch back to the code section for the routine that follows.
1093.text
1094#ifndef	__KERNEL__
1095
// sha512_block_armv8 / Lv8_entry: SHA-512 block routine built on the SHA512
// instructions (sha512h, sha512h2, sha512su0, sha512su1). Those instructions
// are emitted as raw .long words with the intended mnemonic noted beside each
// one, presumably so that assemblers lacking the mnemonics can still build
// this file.
//
// Registers on entry, as used below:
//   x0 - address of the 64-byte hash state; read into v0-v3 at entry and
//        written back from v0-v3 at exit
//   x1 - input pointer; 128 bytes are consumed per Loop_hw iteration
//   x2 - number of 128-byte blocks; decremented once per Loop_hw iteration
//        NOTE(review): a count of zero is not guarded against here (the first
//        block is loaded unconditionally and the subs would wrap) -
//        presumably callers always pass at least one block; confirm.
//
// Written here: x1-x4, v0-v7, v16-v29 and the NZCV flags. v8-v15 are not
// touched. x29 is saved and restored; x30 is saved but never modified.
1096.align	6
1097sha512_block_armv8:
1098Lv8_entry:
// Frame record: push x29/x30 and point x29 at it.
1099	stp	x29,x30,[sp,#-16]!
1100	add	x29,sp,#0
1101
// Preload the first block: 16 message words, two per register, in v16-v23.
// x1 ends up 128 bytes past the start of the block.
1102	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64	// load input
1103	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
1104
// Eight 64-bit state words, two per register, in v0-v3; x3 = &LK512.
1105	ld1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// load context
1106	adrp	x3,LK512@PAGE
1107	add	x3,x3,LK512@PAGEOFF
1108
// Reverse the bytes inside every 64-bit lane (SHA-512 message words are
// big-endian in the input stream).
1109	rev64	v16.16b,v16.16b
1110	rev64	v17.16b,v17.16b
1111	rev64	v18.16b,v18.16b
1112	rev64	v19.16b,v19.16b
1113	rev64	v20.16b,v20.16b
1114	rev64	v21.16b,v21.16b
1115	rev64	v22.16b,v22.16b
1116	rev64	v23.16b,v23.16b
// Branch straight to the loop head, presumably so that padding emitted by the
// .align below is not executed.
1117	b	Loop_hw
1118
1119.align	4
// Loop_hw: one iteration per 128-byte block.
// On entry: v0-v3 = current state, v16-v23 = byte-swapped message words of the
// block to process, x1 = 128 bytes past that block, x3 = start of LK512.
// The iteration is 40 steps; each step consumes one pair of round constants
// and one pair of message words.
1120Loop_hw:
1121	ld1	{v24.2d},[x3],#16
// Count this block; the flags set here feed the csel below.
1122	subs	x2,x2,#1
1123	sub	x4,x1,#128
// Keep a copy of the incoming state in v26-v29 for the final accumulate.
1124	orr	v26.16b,v0.16b,v0.16b			// offload
1125	orr	v27.16b,v1.16b,v1.16b
1126	orr	v28.16b,v2.16b,v2.16b
1127	orr	v29.16b,v3.16b,v3.16b
// If this is the last block (x2 just reached zero) move x1 back to the start
// of the current block, so that the "load next input" loads near the end of
// the iteration re-read this block instead of reading past the input.
1128	csel	x1,x1,x4,ne			// conditional rewind
// Steps 1-32. Each step has the same shape, with the state rotating through
// v0-v4 and the schedule through v16-v23:
//   add + ext #8   : v24 or v25 = K pair + W pair, then its two 64-bit halves
//                    are swapped
//   ld1            : fetch the K pair for the following step (v24 and v25
//                    alternate)
//   ext v5, v6     : build the sha512h operands from neighbouring state halves
//   sha512su0/su1  : update in place the schedule register just consumed, for
//                    reuse eight steps later (v7 is the extra su1 operand)
//   sha512h/h2     : hash update for this pair
1129	add	v24.2d,v24.2d,v16.2d
1130	ld1	{v25.2d},[x3],#16
1131	ext	v24.16b,v24.16b,v24.16b,#8
1132	ext	v5.16b,v2.16b,v3.16b,#8
1133	ext	v6.16b,v1.16b,v2.16b,#8
1134	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1135.long	0xcec08230	//sha512su0 v16.16b,v17.16b
1136	ext	v7.16b,v20.16b,v21.16b,#8
1137.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1138.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1139	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1140.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1141	add	v25.2d,v25.2d,v17.2d
1142	ld1	{v24.2d},[x3],#16
1143	ext	v25.16b,v25.16b,v25.16b,#8
1144	ext	v5.16b,v4.16b,v2.16b,#8
1145	ext	v6.16b,v0.16b,v4.16b,#8
1146	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1147.long	0xcec08251	//sha512su0 v17.16b,v18.16b
1148	ext	v7.16b,v21.16b,v22.16b,#8
1149.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1150.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1151	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1152.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1153	add	v24.2d,v24.2d,v18.2d
1154	ld1	{v25.2d},[x3],#16
1155	ext	v24.16b,v24.16b,v24.16b,#8
1156	ext	v5.16b,v1.16b,v4.16b,#8
1157	ext	v6.16b,v3.16b,v1.16b,#8
1158	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1159.long	0xcec08272	//sha512su0 v18.16b,v19.16b
1160	ext	v7.16b,v22.16b,v23.16b,#8
1161.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1162.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1163	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1164.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1165	add	v25.2d,v25.2d,v19.2d
1166	ld1	{v24.2d},[x3],#16
1167	ext	v25.16b,v25.16b,v25.16b,#8
1168	ext	v5.16b,v0.16b,v1.16b,#8
1169	ext	v6.16b,v2.16b,v0.16b,#8
1170	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1171.long	0xcec08293	//sha512su0 v19.16b,v20.16b
1172	ext	v7.16b,v23.16b,v16.16b,#8
1173.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1174.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1175	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1176.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1177	add	v24.2d,v24.2d,v20.2d
1178	ld1	{v25.2d},[x3],#16
1179	ext	v24.16b,v24.16b,v24.16b,#8
1180	ext	v5.16b,v3.16b,v0.16b,#8
1181	ext	v6.16b,v4.16b,v3.16b,#8
1182	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1183.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
1184	ext	v7.16b,v16.16b,v17.16b,#8
1185.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1186.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1187	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1188.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1189	add	v25.2d,v25.2d,v21.2d
1190	ld1	{v24.2d},[x3],#16
1191	ext	v25.16b,v25.16b,v25.16b,#8
1192	ext	v5.16b,v2.16b,v3.16b,#8
1193	ext	v6.16b,v1.16b,v2.16b,#8
1194	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1195.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
1196	ext	v7.16b,v17.16b,v18.16b,#8
1197.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1198.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1199	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1200.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1201	add	v24.2d,v24.2d,v22.2d
1202	ld1	{v25.2d},[x3],#16
1203	ext	v24.16b,v24.16b,v24.16b,#8
1204	ext	v5.16b,v4.16b,v2.16b,#8
1205	ext	v6.16b,v0.16b,v4.16b,#8
1206	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1207.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
1208	ext	v7.16b,v18.16b,v19.16b,#8
1209.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1210.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1211	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1212.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1213	add	v25.2d,v25.2d,v23.2d
1214	ld1	{v24.2d},[x3],#16
1215	ext	v25.16b,v25.16b,v25.16b,#8
1216	ext	v5.16b,v1.16b,v4.16b,#8
1217	ext	v6.16b,v3.16b,v1.16b,#8
1218	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1219.long	0xcec08217	//sha512su0 v23.16b,v16.16b
1220	ext	v7.16b,v19.16b,v20.16b,#8
1221.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1222.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1223	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1224.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1225	add	v24.2d,v24.2d,v16.2d
1226	ld1	{v25.2d},[x3],#16
1227	ext	v24.16b,v24.16b,v24.16b,#8
1228	ext	v5.16b,v0.16b,v1.16b,#8
1229	ext	v6.16b,v2.16b,v0.16b,#8
1230	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1231.long	0xcec08230	//sha512su0 v16.16b,v17.16b
1232	ext	v7.16b,v20.16b,v21.16b,#8
1233.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1234.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1235	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1236.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1237	add	v25.2d,v25.2d,v17.2d
1238	ld1	{v24.2d},[x3],#16
1239	ext	v25.16b,v25.16b,v25.16b,#8
1240	ext	v5.16b,v3.16b,v0.16b,#8
1241	ext	v6.16b,v4.16b,v3.16b,#8
1242	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1243.long	0xcec08251	//sha512su0 v17.16b,v18.16b
1244	ext	v7.16b,v21.16b,v22.16b,#8
1245.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1246.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1247	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1248.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1249	add	v24.2d,v24.2d,v18.2d
1250	ld1	{v25.2d},[x3],#16
1251	ext	v24.16b,v24.16b,v24.16b,#8
1252	ext	v5.16b,v2.16b,v3.16b,#8
1253	ext	v6.16b,v1.16b,v2.16b,#8
1254	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1255.long	0xcec08272	//sha512su0 v18.16b,v19.16b
1256	ext	v7.16b,v22.16b,v23.16b,#8
1257.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1258.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1259	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1260.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1261	add	v25.2d,v25.2d,v19.2d
1262	ld1	{v24.2d},[x3],#16
1263	ext	v25.16b,v25.16b,v25.16b,#8
1264	ext	v5.16b,v4.16b,v2.16b,#8
1265	ext	v6.16b,v0.16b,v4.16b,#8
1266	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1267.long	0xcec08293	//sha512su0 v19.16b,v20.16b
1268	ext	v7.16b,v23.16b,v16.16b,#8
1269.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1270.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1271	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1272.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1273	add	v24.2d,v24.2d,v20.2d
1274	ld1	{v25.2d},[x3],#16
1275	ext	v24.16b,v24.16b,v24.16b,#8
1276	ext	v5.16b,v1.16b,v4.16b,#8
1277	ext	v6.16b,v3.16b,v1.16b,#8
1278	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1279.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
1280	ext	v7.16b,v16.16b,v17.16b,#8
1281.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1282.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1283	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1284.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1285	add	v25.2d,v25.2d,v21.2d
1286	ld1	{v24.2d},[x3],#16
1287	ext	v25.16b,v25.16b,v25.16b,#8
1288	ext	v5.16b,v0.16b,v1.16b,#8
1289	ext	v6.16b,v2.16b,v0.16b,#8
1290	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1291.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
1292	ext	v7.16b,v17.16b,v18.16b,#8
1293.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1294.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1295	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1296.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1297	add	v24.2d,v24.2d,v22.2d
1298	ld1	{v25.2d},[x3],#16
1299	ext	v24.16b,v24.16b,v24.16b,#8
1300	ext	v5.16b,v3.16b,v0.16b,#8
1301	ext	v6.16b,v4.16b,v3.16b,#8
1302	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1303.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
1304	ext	v7.16b,v18.16b,v19.16b,#8
1305.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1306.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1307	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1308.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1309	add	v25.2d,v25.2d,v23.2d
1310	ld1	{v24.2d},[x3],#16
1311	ext	v25.16b,v25.16b,v25.16b,#8
1312	ext	v5.16b,v2.16b,v3.16b,#8
1313	ext	v6.16b,v1.16b,v2.16b,#8
1314	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1315.long	0xcec08217	//sha512su0 v23.16b,v16.16b
1316	ext	v7.16b,v19.16b,v20.16b,#8
1317.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1318.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1319	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1320.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1321	add	v24.2d,v24.2d,v16.2d
1322	ld1	{v25.2d},[x3],#16
1323	ext	v24.16b,v24.16b,v24.16b,#8
1324	ext	v5.16b,v4.16b,v2.16b,#8
1325	ext	v6.16b,v0.16b,v4.16b,#8
1326	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1327.long	0xcec08230	//sha512su0 v16.16b,v17.16b
1328	ext	v7.16b,v20.16b,v21.16b,#8
1329.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1330.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1331	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1332.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1333	add	v25.2d,v25.2d,v17.2d
1334	ld1	{v24.2d},[x3],#16
1335	ext	v25.16b,v25.16b,v25.16b,#8
1336	ext	v5.16b,v1.16b,v4.16b,#8
1337	ext	v6.16b,v3.16b,v1.16b,#8
1338	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1339.long	0xcec08251	//sha512su0 v17.16b,v18.16b
1340	ext	v7.16b,v21.16b,v22.16b,#8
1341.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1342.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1343	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1344.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1345	add	v24.2d,v24.2d,v18.2d
1346	ld1	{v25.2d},[x3],#16
1347	ext	v24.16b,v24.16b,v24.16b,#8
1348	ext	v5.16b,v0.16b,v1.16b,#8
1349	ext	v6.16b,v2.16b,v0.16b,#8
1350	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1351.long	0xcec08272	//sha512su0 v18.16b,v19.16b
1352	ext	v7.16b,v22.16b,v23.16b,#8
1353.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1354.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1355	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1356.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1357	add	v25.2d,v25.2d,v19.2d
1358	ld1	{v24.2d},[x3],#16
1359	ext	v25.16b,v25.16b,v25.16b,#8
1360	ext	v5.16b,v3.16b,v0.16b,#8
1361	ext	v6.16b,v4.16b,v3.16b,#8
1362	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1363.long	0xcec08293	//sha512su0 v19.16b,v20.16b
1364	ext	v7.16b,v23.16b,v16.16b,#8
1365.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1366.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1367	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1368.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1369	add	v24.2d,v24.2d,v20.2d
1370	ld1	{v25.2d},[x3],#16
1371	ext	v24.16b,v24.16b,v24.16b,#8
1372	ext	v5.16b,v2.16b,v3.16b,#8
1373	ext	v6.16b,v1.16b,v2.16b,#8
1374	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1375.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
1376	ext	v7.16b,v16.16b,v17.16b,#8
1377.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1378.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1379	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1380.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1381	add	v25.2d,v25.2d,v21.2d
1382	ld1	{v24.2d},[x3],#16
1383	ext	v25.16b,v25.16b,v25.16b,#8
1384	ext	v5.16b,v4.16b,v2.16b,#8
1385	ext	v6.16b,v0.16b,v4.16b,#8
1386	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1387.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
1388	ext	v7.16b,v17.16b,v18.16b,#8
1389.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1390.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1391	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1392.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1393	add	v24.2d,v24.2d,v22.2d
1394	ld1	{v25.2d},[x3],#16
1395	ext	v24.16b,v24.16b,v24.16b,#8
1396	ext	v5.16b,v1.16b,v4.16b,#8
1397	ext	v6.16b,v3.16b,v1.16b,#8
1398	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1399.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
1400	ext	v7.16b,v18.16b,v19.16b,#8
1401.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1402.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1403	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1404.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1405	add	v25.2d,v25.2d,v23.2d
1406	ld1	{v24.2d},[x3],#16
1407	ext	v25.16b,v25.16b,v25.16b,#8
1408	ext	v5.16b,v0.16b,v1.16b,#8
1409	ext	v6.16b,v2.16b,v0.16b,#8
1410	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1411.long	0xcec08217	//sha512su0 v23.16b,v16.16b
1412	ext	v7.16b,v19.16b,v20.16b,#8
1413.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1414.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1415	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1416.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1417	add	v24.2d,v24.2d,v16.2d
1418	ld1	{v25.2d},[x3],#16
1419	ext	v24.16b,v24.16b,v24.16b,#8
1420	ext	v5.16b,v3.16b,v0.16b,#8
1421	ext	v6.16b,v4.16b,v3.16b,#8
1422	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1423.long	0xcec08230	//sha512su0 v16.16b,v17.16b
1424	ext	v7.16b,v20.16b,v21.16b,#8
1425.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1426.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1427	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1428.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1429	add	v25.2d,v25.2d,v17.2d
1430	ld1	{v24.2d},[x3],#16
1431	ext	v25.16b,v25.16b,v25.16b,#8
1432	ext	v5.16b,v2.16b,v3.16b,#8
1433	ext	v6.16b,v1.16b,v2.16b,#8
1434	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1435.long	0xcec08251	//sha512su0 v17.16b,v18.16b
1436	ext	v7.16b,v21.16b,v22.16b,#8
1437.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1438.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1439	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1440.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1441	add	v24.2d,v24.2d,v18.2d
1442	ld1	{v25.2d},[x3],#16
1443	ext	v24.16b,v24.16b,v24.16b,#8
1444	ext	v5.16b,v4.16b,v2.16b,#8
1445	ext	v6.16b,v0.16b,v4.16b,#8
1446	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1447.long	0xcec08272	//sha512su0 v18.16b,v19.16b
1448	ext	v7.16b,v22.16b,v23.16b,#8
1449.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1450.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1451	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1452.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1453	add	v25.2d,v25.2d,v19.2d
1454	ld1	{v24.2d},[x3],#16
1455	ext	v25.16b,v25.16b,v25.16b,#8
1456	ext	v5.16b,v1.16b,v4.16b,#8
1457	ext	v6.16b,v3.16b,v1.16b,#8
1458	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1459.long	0xcec08293	//sha512su0 v19.16b,v20.16b
1460	ext	v7.16b,v23.16b,v16.16b,#8
1461.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1462.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1463	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1464.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1465	add	v24.2d,v24.2d,v20.2d
1466	ld1	{v25.2d},[x3],#16
1467	ext	v24.16b,v24.16b,v24.16b,#8
1468	ext	v5.16b,v0.16b,v1.16b,#8
1469	ext	v6.16b,v2.16b,v0.16b,#8
1470	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1471.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
1472	ext	v7.16b,v16.16b,v17.16b,#8
1473.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1474.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1475	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1476.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1477	add	v25.2d,v25.2d,v21.2d
1478	ld1	{v24.2d},[x3],#16
1479	ext	v25.16b,v25.16b,v25.16b,#8
1480	ext	v5.16b,v3.16b,v0.16b,#8
1481	ext	v6.16b,v4.16b,v3.16b,#8
1482	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1483.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
1484	ext	v7.16b,v17.16b,v18.16b,#8
1485.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1486.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1487	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1488.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1489	add	v24.2d,v24.2d,v22.2d
1490	ld1	{v25.2d},[x3],#16
1491	ext	v24.16b,v24.16b,v24.16b,#8
1492	ext	v5.16b,v2.16b,v3.16b,#8
1493	ext	v6.16b,v1.16b,v2.16b,#8
1494	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1495.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
1496	ext	v7.16b,v18.16b,v19.16b,#8
1497.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1498.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1499	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1500.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1501	add	v25.2d,v25.2d,v23.2d
1502	ld1	{v24.2d},[x3],#16
1503	ext	v25.16b,v25.16b,v25.16b,#8
1504	ext	v5.16b,v4.16b,v2.16b,#8
1505	ext	v6.16b,v0.16b,v4.16b,#8
1506	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1507.long	0xcec08217	//sha512su0 v23.16b,v16.16b
1508	ext	v7.16b,v19.16b,v20.16b,#8
1509.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1510.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1511	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1512.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
// Steps 33-40: no schedule update any more. As soon as a schedule register
// has been added to its K pair it is reloaded with the next 16 input bytes and
// byte-swapped, so v16-v23 are ready for the next iteration. On the last block
// x1 was rewound at the top of the loop, so these loads re-read the current
// block.
1513	ld1	{v25.2d},[x3],#16
1514	add	v24.2d,v24.2d,v16.2d
1515	ld1	{v16.16b},[x1],#16		// load next input
1516	ext	v24.16b,v24.16b,v24.16b,#8
1517	ext	v5.16b,v1.16b,v4.16b,#8
1518	ext	v6.16b,v3.16b,v1.16b,#8
1519	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1520.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1521	rev64	v16.16b,v16.16b
1522	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1523.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1524	ld1	{v24.2d},[x3],#16
1525	add	v25.2d,v25.2d,v17.2d
1526	ld1	{v17.16b},[x1],#16		// load next input
1527	ext	v25.16b,v25.16b,v25.16b,#8
1528	ext	v5.16b,v0.16b,v1.16b,#8
1529	ext	v6.16b,v2.16b,v0.16b,#8
1530	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1531.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1532	rev64	v17.16b,v17.16b
1533	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1534.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1535	ld1	{v25.2d},[x3],#16
1536	add	v24.2d,v24.2d,v18.2d
1537	ld1	{v18.16b},[x1],#16		// load next input
1538	ext	v24.16b,v24.16b,v24.16b,#8
1539	ext	v5.16b,v3.16b,v0.16b,#8
1540	ext	v6.16b,v4.16b,v3.16b,#8
1541	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1542.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1543	rev64	v18.16b,v18.16b
1544	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1545.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1546	ld1	{v24.2d},[x3],#16
1547	add	v25.2d,v25.2d,v19.2d
1548	ld1	{v19.16b},[x1],#16		// load next input
1549	ext	v25.16b,v25.16b,v25.16b,#8
1550	ext	v5.16b,v2.16b,v3.16b,#8
1551	ext	v6.16b,v1.16b,v2.16b,#8
1552	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1553.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1554	rev64	v19.16b,v19.16b
1555	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1556.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1557	ld1	{v25.2d},[x3],#16
1558	add	v24.2d,v24.2d,v20.2d
1559	ld1	{v20.16b},[x1],#16		// load next input
1560	ext	v24.16b,v24.16b,v24.16b,#8
1561	ext	v5.16b,v4.16b,v2.16b,#8
1562	ext	v6.16b,v0.16b,v4.16b,#8
1563	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1564.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1565	rev64	v20.16b,v20.16b
1566	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1567.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1568	ld1	{v24.2d},[x3],#16
1569	add	v25.2d,v25.2d,v21.2d
1570	ld1	{v21.16b},[x1],#16		// load next input
1571	ext	v25.16b,v25.16b,v25.16b,#8
1572	ext	v5.16b,v1.16b,v4.16b,#8
1573	ext	v6.16b,v3.16b,v1.16b,#8
1574	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1575.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1576	rev64	v21.16b,v21.16b
1577	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1578.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1579	ld1	{v25.2d},[x3],#16
1580	add	v24.2d,v24.2d,v22.2d
1581	ld1	{v22.16b},[x1],#16		// load next input
1582	ext	v24.16b,v24.16b,v24.16b,#8
1583	ext	v5.16b,v0.16b,v1.16b,#8
1584	ext	v6.16b,v2.16b,v0.16b,#8
1585	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1586.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1587	rev64	v22.16b,v22.16b
1588	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1589.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
// All 40 constant pairs (40 * 16 = 80 * 8 bytes) have been fetched; move x3
// back to the start of LK512 for the next block.
1590	sub	x3,x3,#80*8	// rewind
1591	add	v25.2d,v25.2d,v23.2d
1592	ld1	{v23.16b},[x1],#16		// load next input
1593	ext	v25.16b,v25.16b,v25.16b,#8
1594	ext	v5.16b,v3.16b,v0.16b,#8
1595	ext	v6.16b,v4.16b,v3.16b,#8
1596	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1597.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1598	rev64	v23.16b,v23.16b
1599	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1600.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
// Add the state saved in v26-v29 at the top of the iteration to the working
// state, which the 40-step rotation has brought back to v0-v3.
1601	add	v0.2d,v0.2d,v26.2d			// accumulate
1602	add	v1.2d,v1.2d,v27.2d
1603	add	v2.2d,v2.2d,v28.2d
1604	add	v3.2d,v3.2d,v29.2d
1605
// x2 was decremented at the top of the iteration; repeat while blocks remain.
1606	cbnz	x2,Loop_hw
1607
1608	st1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// store context
1609
// Pop the 16-byte frame record. Only x29 is reloaded: x30 was saved above but
// is never modified in this routine, so ret uses the live value.
1610	ldr	x29,[sp],#16
1611	ret
1612
1613#endif
1614#endif  // !OPENSSL_NO_ASM
1615