• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#if !defined(__has_feature)
5#define __has_feature(x) 0
6#endif
7#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
8#define OPENSSL_NO_ASM
9#endif
10
11#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(_WIN32)
12#if defined(BORINGSSL_PREFIX)
13#include <boringssl_prefix_symbols_asm.h>
14#endif
15// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
16//
17// Licensed under the OpenSSL license (the "License").  You may not use
18// this file except in compliance with the License.  You can obtain a copy
19// in the file LICENSE in the source distribution or at
20// https://www.openssl.org/source/license.html
21
22// ====================================================================
23// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
24// project. The module is, however, dual licensed under OpenSSL and
25// CRYPTOGAMS licenses depending on where you obtain it. For further
26// details see http://www.openssl.org/~appro/cryptogams/.
27//
28// Permission to use under GPLv2 terms is granted.
29// ====================================================================
30//
31// SHA256/512 for ARMv8.
32//
33// Performance in cycles per processed byte and improvement coefficient
34// over code generated with "default" compiler:
35//
36//		SHA256-hw	SHA256(*)	SHA512
37// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
38// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
39// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
40// Denver	2.01		10.5 (+26%)	6.70 (+8%)
41// X-Gene			20.0 (+100%)	12.8 (+300%(***))
42// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
43// Kryo		1.92		17.4 (+30%)	11.2 (+8%)
44//
45// (*)	Software SHA256 results are of lesser relevance, presented
46//	mostly for informational purposes.
47// (**)	The result is a trade-off: it's possible to improve it by
48//	10% (or by 1 cycle per round), but at the cost of 20% loss
49//	on Cortex-A53 (or by 4 cycles per round).
50// (***)	Super-impressive coefficients over gcc-generated code are
51//	indication of some compiler "pathology", most notably code
52//	generated with -mgeneral-regs-only is significantly faster
53//	and the gap is only 40-90%.
54
55#ifndef	__KERNEL__
56# include <openssl/arm_arch.h>
57#endif
58
59.text
60
61
62
// sha512_block_data_order: SHA-512 block transform, general-purpose-register
// code path.
//
// Register inputs, as used by the code below:
//   x0 = pointer to eight 64-bit state words (loaded here, rewritten in
//        place after each block)
//   x1 = pointer to the input data
//   x2 = block count; scaled by lsl#7 (128 bytes per block) to form the
//        end-of-input pointer
//
// Unless __KERNEL__ is defined, OPENSSL_armcap_P is tested for
// ARMV8_SHA512 first; if the bit is set, control branches to Lv8_entry
// (not defined in this part of the file) before any frame is created.
//
// Frame layout established below (x29 = frame base):
//   [x29,#0]      saved x29,x30
//   [x29,#16..95] saved x19..x28
//   [x29,#96]     x0 (state pointer), [x29,#104] end-of-input pointer
//   [x29,#112]    input pointer (stored at the top of Loop)
//   [sp,#0..31]   4*8 bytes below the frame, used as spill slots for
//                 message words
63.globl	sha512_block_data_order
64
65.def sha512_block_data_order
66   .type 32
67.endef
68.align	6
69sha512_block_data_order:
70	AARCH64_VALID_CALL_TARGET
71#ifndef	__KERNEL__
// Form the page address of OPENSSL_armcap_P in x16; the two variants differ
// only in the relocation operator used under hwaddress_sanitizer.
72#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
73	adrp	x16,:pg_hi21_nc:OPENSSL_armcap_P
74#else
75	adrp	x16,OPENSSL_armcap_P
76#endif
77	ldr	w16,[x16,:lo12:OPENSSL_armcap_P]
78	tst	w16,#ARMV8_SHA512
// Capability bit set: take the alternative entry point instead.
79	b.ne	Lv8_entry
80#endif
81	AARCH64_SIGN_LINK_REGISTER
// Allocate the 128-byte frame and save x29/x30 at its base.
82	stp	x29,x30,[sp,#-128]!
83	add	x29,sp,#0
84
// Save x19..x28; all of them are overwritten by the round code.
85	stp	x19,x20,[sp,#16]
86	stp	x21,x22,[sp,#32]
87	stp	x23,x24,[sp,#48]
88	stp	x25,x26,[sp,#64]
89	stp	x27,x28,[sp,#80]
// Reserve four 8-byte spill slots addressed as [sp,#0]..[sp,#24].
90	sub	sp,sp,#4*8
91
92	ldp	x20,x21,[x0]				// load context
93	ldp	x22,x23,[x0,#2*8]
94	ldp	x24,x25,[x0,#4*8]
95	add	x2,x1,x2,lsl#7	// end of input
96	ldp	x26,x27,[x0,#6*8]
// x30 = address of the LK512 constant table; x30 is free for this because
// the return address was saved in the frame above.
97	adrp	x30,LK512
98	add	x30,x30,:lo12:LK512
// Keep the state pointer and end pointer in the frame; x0 and x2 are reused
// for message words inside the loop.
99	stp	x0,x2,[x29,#96]
100
// Loop: executed once per 128-byte input block. This part performs rounds
// 0..15, loading the message words from the input and byte-reversing them
// with rev unless __AARCH64EB__ is defined.
//
// Register roles visible in the code:
//   x20..x27           working variables a..h; the roles rotate by one
//                      register per round instead of moving values
//   x3..x15,x0,x1,x2   message words X[0]..X[15]
//   x30                pointer into LK512, post-incremented by 8 per round
//   x19,x28            alternate between K[i] and the a^b value, which the
//                      following round reuses as b^c for Maj
//   x16,x17            scratch for Sigma1(e), Ch(e,f,g) and Sigma0(a)
// Registers that will later hold message words are used as extra temporaries
// until they are loaded; from round 11 on, already-loaded words are spilled
// to [sp,#0..24] to free temporaries.
// The "h+=Sigma0(a)" addition of each round is deferred: it appears commented
// out at its logical position and is issued near the start of the next round.
101Loop:
102	ldp	x3,x4,[x1],#2*8
103	ldr	x19,[x30],#8			// *K++
104	eor	x28,x21,x22				// magic seed
// Save the input pointer (already advanced by 2*8); x1 is overwritten with a
// message word later in this loop.
105	str	x1,[x29,#112]
106#ifndef	__AARCH64EB__
107	rev	x3,x3			// 0
108#endif
109	ror	x16,x24,#14
110	add	x27,x27,x19			// h+=K[i]
111	eor	x6,x24,x24,ror#23
112	and	x17,x25,x24
113	bic	x19,x26,x24
114	add	x27,x27,x3			// h+=X[i]
115	orr	x17,x17,x19			// Ch(e,f,g)
116	eor	x19,x20,x21			// a^b, b^c in next round
117	eor	x16,x16,x6,ror#18	// Sigma1(e)
118	ror	x6,x20,#28
119	add	x27,x27,x17			// h+=Ch(e,f,g)
120	eor	x17,x20,x20,ror#5
121	add	x27,x27,x16			// h+=Sigma1(e)
122	and	x28,x28,x19			// (b^c)&=(a^b)
123	add	x23,x23,x27			// d+=h
124	eor	x28,x28,x21			// Maj(a,b,c)
125	eor	x17,x6,x17,ror#34	// Sigma0(a)
126	add	x27,x27,x28			// h+=Maj(a,b,c)
127	ldr	x28,[x30],#8		// *K++, x19 in next round
128	//add	x27,x27,x17			// h+=Sigma0(a)
129#ifndef	__AARCH64EB__
130	rev	x4,x4			// 1
131#endif
132	ldp	x5,x6,[x1],#2*8
133	add	x27,x27,x17			// h+=Sigma0(a)
134	ror	x16,x23,#14
135	add	x26,x26,x28			// h+=K[i]
136	eor	x7,x23,x23,ror#23
137	and	x17,x24,x23
138	bic	x28,x25,x23
139	add	x26,x26,x4			// h+=X[i]
140	orr	x17,x17,x28			// Ch(e,f,g)
141	eor	x28,x27,x20			// a^b, b^c in next round
142	eor	x16,x16,x7,ror#18	// Sigma1(e)
143	ror	x7,x27,#28
144	add	x26,x26,x17			// h+=Ch(e,f,g)
145	eor	x17,x27,x27,ror#5
146	add	x26,x26,x16			// h+=Sigma1(e)
147	and	x19,x19,x28			// (b^c)&=(a^b)
148	add	x22,x22,x26			// d+=h
149	eor	x19,x19,x20			// Maj(a,b,c)
150	eor	x17,x7,x17,ror#34	// Sigma0(a)
151	add	x26,x26,x19			// h+=Maj(a,b,c)
152	ldr	x19,[x30],#8		// *K++, x28 in next round
153	//add	x26,x26,x17			// h+=Sigma0(a)
154#ifndef	__AARCH64EB__
155	rev	x5,x5			// 2
156#endif
157	add	x26,x26,x17			// h+=Sigma0(a)
158	ror	x16,x22,#14
159	add	x25,x25,x19			// h+=K[i]
160	eor	x8,x22,x22,ror#23
161	and	x17,x23,x22
162	bic	x19,x24,x22
163	add	x25,x25,x5			// h+=X[i]
164	orr	x17,x17,x19			// Ch(e,f,g)
165	eor	x19,x26,x27			// a^b, b^c in next round
166	eor	x16,x16,x8,ror#18	// Sigma1(e)
167	ror	x8,x26,#28
168	add	x25,x25,x17			// h+=Ch(e,f,g)
169	eor	x17,x26,x26,ror#5
170	add	x25,x25,x16			// h+=Sigma1(e)
171	and	x28,x28,x19			// (b^c)&=(a^b)
172	add	x21,x21,x25			// d+=h
173	eor	x28,x28,x27			// Maj(a,b,c)
174	eor	x17,x8,x17,ror#34	// Sigma0(a)
175	add	x25,x25,x28			// h+=Maj(a,b,c)
176	ldr	x28,[x30],#8		// *K++, x19 in next round
177	//add	x25,x25,x17			// h+=Sigma0(a)
178#ifndef	__AARCH64EB__
179	rev	x6,x6			// 3
180#endif
181	ldp	x7,x8,[x1],#2*8
182	add	x25,x25,x17			// h+=Sigma0(a)
183	ror	x16,x21,#14
184	add	x24,x24,x28			// h+=K[i]
185	eor	x9,x21,x21,ror#23
186	and	x17,x22,x21
187	bic	x28,x23,x21
188	add	x24,x24,x6			// h+=X[i]
189	orr	x17,x17,x28			// Ch(e,f,g)
190	eor	x28,x25,x26			// a^b, b^c in next round
191	eor	x16,x16,x9,ror#18	// Sigma1(e)
192	ror	x9,x25,#28
193	add	x24,x24,x17			// h+=Ch(e,f,g)
194	eor	x17,x25,x25,ror#5
195	add	x24,x24,x16			// h+=Sigma1(e)
196	and	x19,x19,x28			// (b^c)&=(a^b)
197	add	x20,x20,x24			// d+=h
198	eor	x19,x19,x26			// Maj(a,b,c)
199	eor	x17,x9,x17,ror#34	// Sigma0(a)
200	add	x24,x24,x19			// h+=Maj(a,b,c)
201	ldr	x19,[x30],#8		// *K++, x28 in next round
202	//add	x24,x24,x17			// h+=Sigma0(a)
203#ifndef	__AARCH64EB__
204	rev	x7,x7			// 4
205#endif
206	add	x24,x24,x17			// h+=Sigma0(a)
207	ror	x16,x20,#14
208	add	x23,x23,x19			// h+=K[i]
209	eor	x10,x20,x20,ror#23
210	and	x17,x21,x20
211	bic	x19,x22,x20
212	add	x23,x23,x7			// h+=X[i]
213	orr	x17,x17,x19			// Ch(e,f,g)
214	eor	x19,x24,x25			// a^b, b^c in next round
215	eor	x16,x16,x10,ror#18	// Sigma1(e)
216	ror	x10,x24,#28
217	add	x23,x23,x17			// h+=Ch(e,f,g)
218	eor	x17,x24,x24,ror#5
219	add	x23,x23,x16			// h+=Sigma1(e)
220	and	x28,x28,x19			// (b^c)&=(a^b)
221	add	x27,x27,x23			// d+=h
222	eor	x28,x28,x25			// Maj(a,b,c)
223	eor	x17,x10,x17,ror#34	// Sigma0(a)
224	add	x23,x23,x28			// h+=Maj(a,b,c)
225	ldr	x28,[x30],#8		// *K++, x19 in next round
226	//add	x23,x23,x17			// h+=Sigma0(a)
227#ifndef	__AARCH64EB__
228	rev	x8,x8			// 5
229#endif
230	ldp	x9,x10,[x1],#2*8
231	add	x23,x23,x17			// h+=Sigma0(a)
232	ror	x16,x27,#14
233	add	x22,x22,x28			// h+=K[i]
234	eor	x11,x27,x27,ror#23
235	and	x17,x20,x27
236	bic	x28,x21,x27
237	add	x22,x22,x8			// h+=X[i]
238	orr	x17,x17,x28			// Ch(e,f,g)
239	eor	x28,x23,x24			// a^b, b^c in next round
240	eor	x16,x16,x11,ror#18	// Sigma1(e)
241	ror	x11,x23,#28
242	add	x22,x22,x17			// h+=Ch(e,f,g)
243	eor	x17,x23,x23,ror#5
244	add	x22,x22,x16			// h+=Sigma1(e)
245	and	x19,x19,x28			// (b^c)&=(a^b)
246	add	x26,x26,x22			// d+=h
247	eor	x19,x19,x24			// Maj(a,b,c)
248	eor	x17,x11,x17,ror#34	// Sigma0(a)
249	add	x22,x22,x19			// h+=Maj(a,b,c)
250	ldr	x19,[x30],#8		// *K++, x28 in next round
251	//add	x22,x22,x17			// h+=Sigma0(a)
252#ifndef	__AARCH64EB__
253	rev	x9,x9			// 6
254#endif
255	add	x22,x22,x17			// h+=Sigma0(a)
256	ror	x16,x26,#14
257	add	x21,x21,x19			// h+=K[i]
258	eor	x12,x26,x26,ror#23
259	and	x17,x27,x26
260	bic	x19,x20,x26
261	add	x21,x21,x9			// h+=X[i]
262	orr	x17,x17,x19			// Ch(e,f,g)
263	eor	x19,x22,x23			// a^b, b^c in next round
264	eor	x16,x16,x12,ror#18	// Sigma1(e)
265	ror	x12,x22,#28
266	add	x21,x21,x17			// h+=Ch(e,f,g)
267	eor	x17,x22,x22,ror#5
268	add	x21,x21,x16			// h+=Sigma1(e)
269	and	x28,x28,x19			// (b^c)&=(a^b)
270	add	x25,x25,x21			// d+=h
271	eor	x28,x28,x23			// Maj(a,b,c)
272	eor	x17,x12,x17,ror#34	// Sigma0(a)
273	add	x21,x21,x28			// h+=Maj(a,b,c)
274	ldr	x28,[x30],#8		// *K++, x19 in next round
275	//add	x21,x21,x17			// h+=Sigma0(a)
276#ifndef	__AARCH64EB__
277	rev	x10,x10			// 7
278#endif
279	ldp	x11,x12,[x1],#2*8
280	add	x21,x21,x17			// h+=Sigma0(a)
281	ror	x16,x25,#14
282	add	x20,x20,x28			// h+=K[i]
283	eor	x13,x25,x25,ror#23
284	and	x17,x26,x25
285	bic	x28,x27,x25
286	add	x20,x20,x10			// h+=X[i]
287	orr	x17,x17,x28			// Ch(e,f,g)
288	eor	x28,x21,x22			// a^b, b^c in next round
289	eor	x16,x16,x13,ror#18	// Sigma1(e)
290	ror	x13,x21,#28
291	add	x20,x20,x17			// h+=Ch(e,f,g)
292	eor	x17,x21,x21,ror#5
293	add	x20,x20,x16			// h+=Sigma1(e)
294	and	x19,x19,x28			// (b^c)&=(a^b)
295	add	x24,x24,x20			// d+=h
296	eor	x19,x19,x22			// Maj(a,b,c)
297	eor	x17,x13,x17,ror#34	// Sigma0(a)
298	add	x20,x20,x19			// h+=Maj(a,b,c)
299	ldr	x19,[x30],#8		// *K++, x28 in next round
300	//add	x20,x20,x17			// h+=Sigma0(a)
301#ifndef	__AARCH64EB__
302	rev	x11,x11			// 8
303#endif
304	add	x20,x20,x17			// h+=Sigma0(a)
305	ror	x16,x24,#14
306	add	x27,x27,x19			// h+=K[i]
307	eor	x14,x24,x24,ror#23
308	and	x17,x25,x24
309	bic	x19,x26,x24
310	add	x27,x27,x11			// h+=X[i]
311	orr	x17,x17,x19			// Ch(e,f,g)
312	eor	x19,x20,x21			// a^b, b^c in next round
313	eor	x16,x16,x14,ror#18	// Sigma1(e)
314	ror	x14,x20,#28
315	add	x27,x27,x17			// h+=Ch(e,f,g)
316	eor	x17,x20,x20,ror#5
317	add	x27,x27,x16			// h+=Sigma1(e)
318	and	x28,x28,x19			// (b^c)&=(a^b)
319	add	x23,x23,x27			// d+=h
320	eor	x28,x28,x21			// Maj(a,b,c)
321	eor	x17,x14,x17,ror#34	// Sigma0(a)
322	add	x27,x27,x28			// h+=Maj(a,b,c)
323	ldr	x28,[x30],#8		// *K++, x19 in next round
324	//add	x27,x27,x17			// h+=Sigma0(a)
325#ifndef	__AARCH64EB__
326	rev	x12,x12			// 9
327#endif
328	ldp	x13,x14,[x1],#2*8
329	add	x27,x27,x17			// h+=Sigma0(a)
330	ror	x16,x23,#14
331	add	x26,x26,x28			// h+=K[i]
332	eor	x15,x23,x23,ror#23
333	and	x17,x24,x23
334	bic	x28,x25,x23
335	add	x26,x26,x12			// h+=X[i]
336	orr	x17,x17,x28			// Ch(e,f,g)
337	eor	x28,x27,x20			// a^b, b^c in next round
338	eor	x16,x16,x15,ror#18	// Sigma1(e)
339	ror	x15,x27,#28
340	add	x26,x26,x17			// h+=Ch(e,f,g)
341	eor	x17,x27,x27,ror#5
342	add	x26,x26,x16			// h+=Sigma1(e)
343	and	x19,x19,x28			// (b^c)&=(a^b)
344	add	x22,x22,x26			// d+=h
345	eor	x19,x19,x20			// Maj(a,b,c)
346	eor	x17,x15,x17,ror#34	// Sigma0(a)
347	add	x26,x26,x19			// h+=Maj(a,b,c)
348	ldr	x19,[x30],#8		// *K++, x28 in next round
349	//add	x26,x26,x17			// h+=Sigma0(a)
350#ifndef	__AARCH64EB__
351	rev	x13,x13			// 10
352#endif
353	add	x26,x26,x17			// h+=Sigma0(a)
354	ror	x16,x22,#14
355	add	x25,x25,x19			// h+=K[i]
356	eor	x0,x22,x22,ror#23
357	and	x17,x23,x22
358	bic	x19,x24,x22
359	add	x25,x25,x13			// h+=X[i]
360	orr	x17,x17,x19			// Ch(e,f,g)
361	eor	x19,x26,x27			// a^b, b^c in next round
362	eor	x16,x16,x0,ror#18	// Sigma1(e)
363	ror	x0,x26,#28
364	add	x25,x25,x17			// h+=Ch(e,f,g)
365	eor	x17,x26,x26,ror#5
366	add	x25,x25,x16			// h+=Sigma1(e)
367	and	x28,x28,x19			// (b^c)&=(a^b)
368	add	x21,x21,x25			// d+=h
369	eor	x28,x28,x27			// Maj(a,b,c)
370	eor	x17,x0,x17,ror#34	// Sigma0(a)
371	add	x25,x25,x28			// h+=Maj(a,b,c)
372	ldr	x28,[x30],#8		// *K++, x19 in next round
373	//add	x25,x25,x17			// h+=Sigma0(a)
374#ifndef	__AARCH64EB__
375	rev	x14,x14			// 11
376#endif
377	ldp	x15,x0,[x1],#2*8
378	add	x25,x25,x17			// h+=Sigma0(a)
// Spill X[3] so that x6 can serve as a temporary; it is reloaded from the
// same slot during round 14.
379	str	x6,[sp,#24]
380	ror	x16,x21,#14
381	add	x24,x24,x28			// h+=K[i]
382	eor	x6,x21,x21,ror#23
383	and	x17,x22,x21
384	bic	x28,x23,x21
385	add	x24,x24,x14			// h+=X[i]
386	orr	x17,x17,x28			// Ch(e,f,g)
387	eor	x28,x25,x26			// a^b, b^c in next round
388	eor	x16,x16,x6,ror#18	// Sigma1(e)
389	ror	x6,x25,#28
390	add	x24,x24,x17			// h+=Ch(e,f,g)
391	eor	x17,x25,x25,ror#5
392	add	x24,x24,x16			// h+=Sigma1(e)
393	and	x19,x19,x28			// (b^c)&=(a^b)
394	add	x20,x20,x24			// d+=h
395	eor	x19,x19,x26			// Maj(a,b,c)
396	eor	x17,x6,x17,ror#34	// Sigma0(a)
397	add	x24,x24,x19			// h+=Maj(a,b,c)
398	ldr	x19,[x30],#8		// *K++, x28 in next round
399	//add	x24,x24,x17			// h+=Sigma0(a)
400#ifndef	__AARCH64EB__
401	rev	x15,x15			// 12
402#endif
403	add	x24,x24,x17			// h+=Sigma0(a)
// Spill X[4]; x7 becomes a temporary and is reloaded during round 15.
404	str	x7,[sp,#0]
405	ror	x16,x20,#14
406	add	x23,x23,x19			// h+=K[i]
407	eor	x7,x20,x20,ror#23
408	and	x17,x21,x20
409	bic	x19,x22,x20
410	add	x23,x23,x15			// h+=X[i]
411	orr	x17,x17,x19			// Ch(e,f,g)
412	eor	x19,x24,x25			// a^b, b^c in next round
413	eor	x16,x16,x7,ror#18	// Sigma1(e)
414	ror	x7,x24,#28
415	add	x23,x23,x17			// h+=Ch(e,f,g)
416	eor	x17,x24,x24,ror#5
417	add	x23,x23,x16			// h+=Sigma1(e)
418	and	x28,x28,x19			// (b^c)&=(a^b)
419	add	x27,x27,x23			// d+=h
420	eor	x28,x28,x25			// Maj(a,b,c)
421	eor	x17,x7,x17,ror#34	// Sigma0(a)
422	add	x23,x23,x28			// h+=Maj(a,b,c)
423	ldr	x28,[x30],#8		// *K++, x19 in next round
424	//add	x23,x23,x17			// h+=Sigma0(a)
425#ifndef	__AARCH64EB__
426	rev	x0,x0			// 13
427#endif
// Load the last two message words into x1,x2. This overwrites the input
// pointer in x1 (a copy was stored at [x29,#112]) and the end pointer in x2
// (stored at [x29,#104] by the prologue).
428	ldp	x1,x2,[x1]
429	add	x23,x23,x17			// h+=Sigma0(a)
// Spill X[5]; x8 becomes a temporary.
430	str	x8,[sp,#8]
431	ror	x16,x27,#14
432	add	x22,x22,x28			// h+=K[i]
433	eor	x8,x27,x27,ror#23
434	and	x17,x20,x27
435	bic	x28,x21,x27
436	add	x22,x22,x0			// h+=X[i]
437	orr	x17,x17,x28			// Ch(e,f,g)
438	eor	x28,x23,x24			// a^b, b^c in next round
439	eor	x16,x16,x8,ror#18	// Sigma1(e)
440	ror	x8,x23,#28
441	add	x22,x22,x17			// h+=Ch(e,f,g)
442	eor	x17,x23,x23,ror#5
443	add	x22,x22,x16			// h+=Sigma1(e)
444	and	x19,x19,x28			// (b^c)&=(a^b)
445	add	x26,x26,x22			// d+=h
446	eor	x19,x19,x24			// Maj(a,b,c)
447	eor	x17,x8,x17,ror#34	// Sigma0(a)
448	add	x22,x22,x19			// h+=Maj(a,b,c)
449	ldr	x19,[x30],#8		// *K++, x28 in next round
450	//add	x22,x22,x17			// h+=Sigma0(a)
451#ifndef	__AARCH64EB__
452	rev	x1,x1			// 14
453#endif
// Restore X[3] into x6 and spill X[6] from x9, which becomes a temporary.
454	ldr	x6,[sp,#24]
455	add	x22,x22,x17			// h+=Sigma0(a)
456	str	x9,[sp,#16]
457	ror	x16,x26,#14
458	add	x21,x21,x19			// h+=K[i]
459	eor	x9,x26,x26,ror#23
460	and	x17,x27,x26
461	bic	x19,x20,x26
462	add	x21,x21,x1			// h+=X[i]
463	orr	x17,x17,x19			// Ch(e,f,g)
464	eor	x19,x22,x23			// a^b, b^c in next round
465	eor	x16,x16,x9,ror#18	// Sigma1(e)
466	ror	x9,x22,#28
467	add	x21,x21,x17			// h+=Ch(e,f,g)
468	eor	x17,x22,x22,ror#5
469	add	x21,x21,x16			// h+=Sigma1(e)
470	and	x28,x28,x19			// (b^c)&=(a^b)
471	add	x25,x25,x21			// d+=h
472	eor	x28,x28,x23			// Maj(a,b,c)
473	eor	x17,x9,x17,ror#34	// Sigma0(a)
474	add	x21,x21,x28			// h+=Maj(a,b,c)
475	ldr	x28,[x30],#8		// *K++, x19 in next round
476	//add	x21,x21,x17			// h+=Sigma0(a)
477#ifndef	__AARCH64EB__
478	rev	x2,x2			// 15
479#endif
// Round 15 is interleaved with the first message-schedule step, leaving
// x3 = X[0] + X[9] + sigma0(X[1]) + sigma1(X[14]) for the first round of
// Loop_16_xx. X[4] is restored into x7 and X[7] is spilled from x10.
480	ldr	x7,[sp,#0]
481	add	x21,x21,x17			// h+=Sigma0(a)
482	str	x10,[sp,#24]
483	ror	x16,x25,#14
484	add	x20,x20,x28			// h+=K[i]
485	ror	x9,x4,#1
486	and	x17,x26,x25
487	ror	x8,x1,#19
488	bic	x28,x27,x25
489	ror	x10,x21,#28
490	add	x20,x20,x2			// h+=X[i]
491	eor	x16,x16,x25,ror#18
492	eor	x9,x9,x4,ror#8
493	orr	x17,x17,x28			// Ch(e,f,g)
494	eor	x28,x21,x22			// a^b, b^c in next round
495	eor	x16,x16,x25,ror#41	// Sigma1(e)
496	eor	x10,x10,x21,ror#34
497	add	x20,x20,x17			// h+=Ch(e,f,g)
498	and	x19,x19,x28			// (b^c)&=(a^b)
499	eor	x8,x8,x1,ror#61
500	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
501	add	x20,x20,x16			// h+=Sigma1(e)
502	eor	x19,x19,x22			// Maj(a,b,c)
503	eor	x17,x10,x21,ror#39	// Sigma0(a)
504	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
505	add	x3,x3,x12
506	add	x24,x24,x20			// d+=h
507	add	x20,x20,x19			// h+=Maj(a,b,c)
508	ldr	x19,[x30],#8		// *K++, x28 in next round
509	add	x3,x3,x9
510	add	x20,x20,x17			// h+=Sigma0(a)
511	add	x3,x3,x8
// Loop_16_xx: the rounds after the first sixteen, unrolled 16 rounds per
// pass. Each round adds the message word prepared by the round before it
// and, interleaved with the compression step, prepares the next one:
//   X[j] += X[j+9] + sigma0(X[j+1]) + sigma1(X[j+14])   (indices mod 16)
// where sigma0(x) = ror(x,1)^ror(x,8)^lsr(x,7) and
//       sigma1(x) = ror(x,19)^ror(x,61)^lsr(x,6), as coded below.
// Unlike rounds 0..14, Sigma0(a) is added within the same round here.
//
// Each round starts with an ldr/str pair on the four spill slots at
// [sp,#0..24]: one message word is reloaded and another is stored, because
// three message-word registers are borrowed as temporaries in every round.
//
// x19/x28 alternate between K[i] and the a^b value as in rounds 0..15, and
// x30 keeps walking LK512. The pass repeats until the K value fetched for
// the next round is the zero terminator of the table (cbnz at the bottom).
512Loop_16_xx:
513	ldr	x8,[sp,#8]
514	str	x11,[sp,#0]
515	ror	x16,x24,#14
516	add	x27,x27,x19			// h+=K[i]
517	ror	x10,x5,#1
518	and	x17,x25,x24
519	ror	x9,x2,#19
520	bic	x19,x26,x24
521	ror	x11,x20,#28
522	add	x27,x27,x3			// h+=X[i]
523	eor	x16,x16,x24,ror#18
524	eor	x10,x10,x5,ror#8
525	orr	x17,x17,x19			// Ch(e,f,g)
526	eor	x19,x20,x21			// a^b, b^c in next round
527	eor	x16,x16,x24,ror#41	// Sigma1(e)
528	eor	x11,x11,x20,ror#34
529	add	x27,x27,x17			// h+=Ch(e,f,g)
530	and	x28,x28,x19			// (b^c)&=(a^b)
531	eor	x9,x9,x2,ror#61
532	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
533	add	x27,x27,x16			// h+=Sigma1(e)
534	eor	x28,x28,x21			// Maj(a,b,c)
535	eor	x17,x11,x20,ror#39	// Sigma0(a)
536	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
537	add	x4,x4,x13
538	add	x23,x23,x27			// d+=h
539	add	x27,x27,x28			// h+=Maj(a,b,c)
540	ldr	x28,[x30],#8		// *K++, x19 in next round
541	add	x4,x4,x10
542	add	x27,x27,x17			// h+=Sigma0(a)
543	add	x4,x4,x9
544	ldr	x9,[sp,#16]
545	str	x12,[sp,#8]
546	ror	x16,x23,#14
547	add	x26,x26,x28			// h+=K[i]
548	ror	x11,x6,#1
549	and	x17,x24,x23
550	ror	x10,x3,#19
551	bic	x28,x25,x23
552	ror	x12,x27,#28
553	add	x26,x26,x4			// h+=X[i]
554	eor	x16,x16,x23,ror#18
555	eor	x11,x11,x6,ror#8
556	orr	x17,x17,x28			// Ch(e,f,g)
557	eor	x28,x27,x20			// a^b, b^c in next round
558	eor	x16,x16,x23,ror#41	// Sigma1(e)
559	eor	x12,x12,x27,ror#34
560	add	x26,x26,x17			// h+=Ch(e,f,g)
561	and	x19,x19,x28			// (b^c)&=(a^b)
562	eor	x10,x10,x3,ror#61
563	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
564	add	x26,x26,x16			// h+=Sigma1(e)
565	eor	x19,x19,x20			// Maj(a,b,c)
566	eor	x17,x12,x27,ror#39	// Sigma0(a)
567	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
568	add	x5,x5,x14
569	add	x22,x22,x26			// d+=h
570	add	x26,x26,x19			// h+=Maj(a,b,c)
571	ldr	x19,[x30],#8		// *K++, x28 in next round
572	add	x5,x5,x11
573	add	x26,x26,x17			// h+=Sigma0(a)
574	add	x5,x5,x10
575	ldr	x10,[sp,#24]
576	str	x13,[sp,#16]
577	ror	x16,x22,#14
578	add	x25,x25,x19			// h+=K[i]
579	ror	x12,x7,#1
580	and	x17,x23,x22
581	ror	x11,x4,#19
582	bic	x19,x24,x22
583	ror	x13,x26,#28
584	add	x25,x25,x5			// h+=X[i]
585	eor	x16,x16,x22,ror#18
586	eor	x12,x12,x7,ror#8
587	orr	x17,x17,x19			// Ch(e,f,g)
588	eor	x19,x26,x27			// a^b, b^c in next round
589	eor	x16,x16,x22,ror#41	// Sigma1(e)
590	eor	x13,x13,x26,ror#34
591	add	x25,x25,x17			// h+=Ch(e,f,g)
592	and	x28,x28,x19			// (b^c)&=(a^b)
593	eor	x11,x11,x4,ror#61
594	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
595	add	x25,x25,x16			// h+=Sigma1(e)
596	eor	x28,x28,x27			// Maj(a,b,c)
597	eor	x17,x13,x26,ror#39	// Sigma0(a)
598	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
599	add	x6,x6,x15
600	add	x21,x21,x25			// d+=h
601	add	x25,x25,x28			// h+=Maj(a,b,c)
602	ldr	x28,[x30],#8		// *K++, x19 in next round
603	add	x6,x6,x12
604	add	x25,x25,x17			// h+=Sigma0(a)
605	add	x6,x6,x11
606	ldr	x11,[sp,#0]
607	str	x14,[sp,#24]
608	ror	x16,x21,#14
609	add	x24,x24,x28			// h+=K[i]
610	ror	x13,x8,#1
611	and	x17,x22,x21
612	ror	x12,x5,#19
613	bic	x28,x23,x21
614	ror	x14,x25,#28
615	add	x24,x24,x6			// h+=X[i]
616	eor	x16,x16,x21,ror#18
617	eor	x13,x13,x8,ror#8
618	orr	x17,x17,x28			// Ch(e,f,g)
619	eor	x28,x25,x26			// a^b, b^c in next round
620	eor	x16,x16,x21,ror#41	// Sigma1(e)
621	eor	x14,x14,x25,ror#34
622	add	x24,x24,x17			// h+=Ch(e,f,g)
623	and	x19,x19,x28			// (b^c)&=(a^b)
624	eor	x12,x12,x5,ror#61
625	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
626	add	x24,x24,x16			// h+=Sigma1(e)
627	eor	x19,x19,x26			// Maj(a,b,c)
628	eor	x17,x14,x25,ror#39	// Sigma0(a)
629	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
630	add	x7,x7,x0
631	add	x20,x20,x24			// d+=h
632	add	x24,x24,x19			// h+=Maj(a,b,c)
633	ldr	x19,[x30],#8		// *K++, x28 in next round
634	add	x7,x7,x13
635	add	x24,x24,x17			// h+=Sigma0(a)
636	add	x7,x7,x12
637	ldr	x12,[sp,#8]
638	str	x15,[sp,#0]
639	ror	x16,x20,#14
640	add	x23,x23,x19			// h+=K[i]
641	ror	x14,x9,#1
642	and	x17,x21,x20
643	ror	x13,x6,#19
644	bic	x19,x22,x20
645	ror	x15,x24,#28
646	add	x23,x23,x7			// h+=X[i]
647	eor	x16,x16,x20,ror#18
648	eor	x14,x14,x9,ror#8
649	orr	x17,x17,x19			// Ch(e,f,g)
650	eor	x19,x24,x25			// a^b, b^c in next round
651	eor	x16,x16,x20,ror#41	// Sigma1(e)
652	eor	x15,x15,x24,ror#34
653	add	x23,x23,x17			// h+=Ch(e,f,g)
654	and	x28,x28,x19			// (b^c)&=(a^b)
655	eor	x13,x13,x6,ror#61
656	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
657	add	x23,x23,x16			// h+=Sigma1(e)
658	eor	x28,x28,x25			// Maj(a,b,c)
659	eor	x17,x15,x24,ror#39	// Sigma0(a)
660	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
661	add	x8,x8,x1
662	add	x27,x27,x23			// d+=h
663	add	x23,x23,x28			// h+=Maj(a,b,c)
664	ldr	x28,[x30],#8		// *K++, x19 in next round
665	add	x8,x8,x14
666	add	x23,x23,x17			// h+=Sigma0(a)
667	add	x8,x8,x13
668	ldr	x13,[sp,#16]
669	str	x0,[sp,#8]
670	ror	x16,x27,#14
671	add	x22,x22,x28			// h+=K[i]
672	ror	x15,x10,#1
673	and	x17,x20,x27
674	ror	x14,x7,#19
675	bic	x28,x21,x27
676	ror	x0,x23,#28
677	add	x22,x22,x8			// h+=X[i]
678	eor	x16,x16,x27,ror#18
679	eor	x15,x15,x10,ror#8
680	orr	x17,x17,x28			// Ch(e,f,g)
681	eor	x28,x23,x24			// a^b, b^c in next round
682	eor	x16,x16,x27,ror#41	// Sigma1(e)
683	eor	x0,x0,x23,ror#34
684	add	x22,x22,x17			// h+=Ch(e,f,g)
685	and	x19,x19,x28			// (b^c)&=(a^b)
686	eor	x14,x14,x7,ror#61
687	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
688	add	x22,x22,x16			// h+=Sigma1(e)
689	eor	x19,x19,x24			// Maj(a,b,c)
690	eor	x17,x0,x23,ror#39	// Sigma0(a)
691	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
692	add	x9,x9,x2
693	add	x26,x26,x22			// d+=h
694	add	x22,x22,x19			// h+=Maj(a,b,c)
695	ldr	x19,[x30],#8		// *K++, x28 in next round
696	add	x9,x9,x15
697	add	x22,x22,x17			// h+=Sigma0(a)
698	add	x9,x9,x14
699	ldr	x14,[sp,#24]
700	str	x1,[sp,#16]
701	ror	x16,x26,#14
702	add	x21,x21,x19			// h+=K[i]
703	ror	x0,x11,#1
704	and	x17,x27,x26
705	ror	x15,x8,#19
706	bic	x19,x20,x26
707	ror	x1,x22,#28
708	add	x21,x21,x9			// h+=X[i]
709	eor	x16,x16,x26,ror#18
710	eor	x0,x0,x11,ror#8
711	orr	x17,x17,x19			// Ch(e,f,g)
712	eor	x19,x22,x23			// a^b, b^c in next round
713	eor	x16,x16,x26,ror#41	// Sigma1(e)
714	eor	x1,x1,x22,ror#34
715	add	x21,x21,x17			// h+=Ch(e,f,g)
716	and	x28,x28,x19			// (b^c)&=(a^b)
717	eor	x15,x15,x8,ror#61
718	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
719	add	x21,x21,x16			// h+=Sigma1(e)
720	eor	x28,x28,x23			// Maj(a,b,c)
721	eor	x17,x1,x22,ror#39	// Sigma0(a)
722	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
723	add	x10,x10,x3
724	add	x25,x25,x21			// d+=h
725	add	x21,x21,x28			// h+=Maj(a,b,c)
726	ldr	x28,[x30],#8		// *K++, x19 in next round
727	add	x10,x10,x0
728	add	x21,x21,x17			// h+=Sigma0(a)
729	add	x10,x10,x15
730	ldr	x15,[sp,#0]
731	str	x2,[sp,#24]
732	ror	x16,x25,#14
733	add	x20,x20,x28			// h+=K[i]
734	ror	x1,x12,#1
735	and	x17,x26,x25
736	ror	x0,x9,#19
737	bic	x28,x27,x25
738	ror	x2,x21,#28
739	add	x20,x20,x10			// h+=X[i]
740	eor	x16,x16,x25,ror#18
741	eor	x1,x1,x12,ror#8
742	orr	x17,x17,x28			// Ch(e,f,g)
743	eor	x28,x21,x22			// a^b, b^c in next round
744	eor	x16,x16,x25,ror#41	// Sigma1(e)
745	eor	x2,x2,x21,ror#34
746	add	x20,x20,x17			// h+=Ch(e,f,g)
747	and	x19,x19,x28			// (b^c)&=(a^b)
748	eor	x0,x0,x9,ror#61
749	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
750	add	x20,x20,x16			// h+=Sigma1(e)
751	eor	x19,x19,x22			// Maj(a,b,c)
752	eor	x17,x2,x21,ror#39	// Sigma0(a)
753	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
754	add	x11,x11,x4
755	add	x24,x24,x20			// d+=h
756	add	x20,x20,x19			// h+=Maj(a,b,c)
757	ldr	x19,[x30],#8		// *K++, x28 in next round
758	add	x11,x11,x1
759	add	x20,x20,x17			// h+=Sigma0(a)
760	add	x11,x11,x0
761	ldr	x0,[sp,#8]
762	str	x3,[sp,#0]
763	ror	x16,x24,#14
764	add	x27,x27,x19			// h+=K[i]
765	ror	x2,x13,#1
766	and	x17,x25,x24
767	ror	x1,x10,#19
768	bic	x19,x26,x24
769	ror	x3,x20,#28
770	add	x27,x27,x11			// h+=X[i]
771	eor	x16,x16,x24,ror#18
772	eor	x2,x2,x13,ror#8
773	orr	x17,x17,x19			// Ch(e,f,g)
774	eor	x19,x20,x21			// a^b, b^c in next round
775	eor	x16,x16,x24,ror#41	// Sigma1(e)
776	eor	x3,x3,x20,ror#34
777	add	x27,x27,x17			// h+=Ch(e,f,g)
778	and	x28,x28,x19			// (b^c)&=(a^b)
779	eor	x1,x1,x10,ror#61
780	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
781	add	x27,x27,x16			// h+=Sigma1(e)
782	eor	x28,x28,x21			// Maj(a,b,c)
783	eor	x17,x3,x20,ror#39	// Sigma0(a)
784	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
785	add	x12,x12,x5
786	add	x23,x23,x27			// d+=h
787	add	x27,x27,x28			// h+=Maj(a,b,c)
788	ldr	x28,[x30],#8		// *K++, x19 in next round
789	add	x12,x12,x2
790	add	x27,x27,x17			// h+=Sigma0(a)
791	add	x12,x12,x1
792	ldr	x1,[sp,#16]
793	str	x4,[sp,#8]
794	ror	x16,x23,#14
795	add	x26,x26,x28			// h+=K[i]
796	ror	x3,x14,#1
797	and	x17,x24,x23
798	ror	x2,x11,#19
799	bic	x28,x25,x23
800	ror	x4,x27,#28
801	add	x26,x26,x12			// h+=X[i]
802	eor	x16,x16,x23,ror#18
803	eor	x3,x3,x14,ror#8
804	orr	x17,x17,x28			// Ch(e,f,g)
805	eor	x28,x27,x20			// a^b, b^c in next round
806	eor	x16,x16,x23,ror#41	// Sigma1(e)
807	eor	x4,x4,x27,ror#34
808	add	x26,x26,x17			// h+=Ch(e,f,g)
809	and	x19,x19,x28			// (b^c)&=(a^b)
810	eor	x2,x2,x11,ror#61
811	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
812	add	x26,x26,x16			// h+=Sigma1(e)
813	eor	x19,x19,x20			// Maj(a,b,c)
814	eor	x17,x4,x27,ror#39	// Sigma0(a)
815	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
816	add	x13,x13,x6
817	add	x22,x22,x26			// d+=h
818	add	x26,x26,x19			// h+=Maj(a,b,c)
819	ldr	x19,[x30],#8		// *K++, x28 in next round
820	add	x13,x13,x3
821	add	x26,x26,x17			// h+=Sigma0(a)
822	add	x13,x13,x2
823	ldr	x2,[sp,#24]
824	str	x5,[sp,#16]
825	ror	x16,x22,#14
826	add	x25,x25,x19			// h+=K[i]
827	ror	x4,x15,#1
828	and	x17,x23,x22
829	ror	x3,x12,#19
830	bic	x19,x24,x22
831	ror	x5,x26,#28
832	add	x25,x25,x13			// h+=X[i]
833	eor	x16,x16,x22,ror#18
834	eor	x4,x4,x15,ror#8
835	orr	x17,x17,x19			// Ch(e,f,g)
836	eor	x19,x26,x27			// a^b, b^c in next round
837	eor	x16,x16,x22,ror#41	// Sigma1(e)
838	eor	x5,x5,x26,ror#34
839	add	x25,x25,x17			// h+=Ch(e,f,g)
840	and	x28,x28,x19			// (b^c)&=(a^b)
841	eor	x3,x3,x12,ror#61
842	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
843	add	x25,x25,x16			// h+=Sigma1(e)
844	eor	x28,x28,x27			// Maj(a,b,c)
845	eor	x17,x5,x26,ror#39	// Sigma0(a)
846	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
847	add	x14,x14,x7
848	add	x21,x21,x25			// d+=h
849	add	x25,x25,x28			// h+=Maj(a,b,c)
850	ldr	x28,[x30],#8		// *K++, x19 in next round
851	add	x14,x14,x4
852	add	x25,x25,x17			// h+=Sigma0(a)
853	add	x14,x14,x3
854	ldr	x3,[sp,#0]
855	str	x6,[sp,#24]
856	ror	x16,x21,#14
857	add	x24,x24,x28			// h+=K[i]
858	ror	x5,x0,#1
859	and	x17,x22,x21
860	ror	x4,x13,#19
861	bic	x28,x23,x21
862	ror	x6,x25,#28
863	add	x24,x24,x14			// h+=X[i]
864	eor	x16,x16,x21,ror#18
865	eor	x5,x5,x0,ror#8
866	orr	x17,x17,x28			// Ch(e,f,g)
867	eor	x28,x25,x26			// a^b, b^c in next round
868	eor	x16,x16,x21,ror#41	// Sigma1(e)
869	eor	x6,x6,x25,ror#34
870	add	x24,x24,x17			// h+=Ch(e,f,g)
871	and	x19,x19,x28			// (b^c)&=(a^b)
872	eor	x4,x4,x13,ror#61
873	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
874	add	x24,x24,x16			// h+=Sigma1(e)
875	eor	x19,x19,x26			// Maj(a,b,c)
876	eor	x17,x6,x25,ror#39	// Sigma0(a)
877	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
878	add	x15,x15,x8
879	add	x20,x20,x24			// d+=h
880	add	x24,x24,x19			// h+=Maj(a,b,c)
881	ldr	x19,[x30],#8		// *K++, x28 in next round
882	add	x15,x15,x5
883	add	x24,x24,x17			// h+=Sigma0(a)
884	add	x15,x15,x4
885	ldr	x4,[sp,#8]
886	str	x7,[sp,#0]
887	ror	x16,x20,#14
888	add	x23,x23,x19			// h+=K[i]
889	ror	x6,x1,#1
890	and	x17,x21,x20
891	ror	x5,x14,#19
892	bic	x19,x22,x20
893	ror	x7,x24,#28
894	add	x23,x23,x15			// h+=X[i]
895	eor	x16,x16,x20,ror#18
896	eor	x6,x6,x1,ror#8
897	orr	x17,x17,x19			// Ch(e,f,g)
898	eor	x19,x24,x25			// a^b, b^c in next round
899	eor	x16,x16,x20,ror#41	// Sigma1(e)
900	eor	x7,x7,x24,ror#34
901	add	x23,x23,x17			// h+=Ch(e,f,g)
902	and	x28,x28,x19			// (b^c)&=(a^b)
903	eor	x5,x5,x14,ror#61
904	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
905	add	x23,x23,x16			// h+=Sigma1(e)
906	eor	x28,x28,x25			// Maj(a,b,c)
907	eor	x17,x7,x24,ror#39	// Sigma0(a)
908	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
909	add	x0,x0,x9
910	add	x27,x27,x23			// d+=h
911	add	x23,x23,x28			// h+=Maj(a,b,c)
912	ldr	x28,[x30],#8		// *K++, x19 in next round
913	add	x0,x0,x6
914	add	x23,x23,x17			// h+=Sigma0(a)
915	add	x0,x0,x5
916	ldr	x5,[sp,#16]
917	str	x8,[sp,#8]
918	ror	x16,x27,#14
919	add	x22,x22,x28			// h+=K[i]
920	ror	x7,x2,#1
921	and	x17,x20,x27
922	ror	x6,x15,#19
923	bic	x28,x21,x27
924	ror	x8,x23,#28
925	add	x22,x22,x0			// h+=X[i]
926	eor	x16,x16,x27,ror#18
927	eor	x7,x7,x2,ror#8
928	orr	x17,x17,x28			// Ch(e,f,g)
929	eor	x28,x23,x24			// a^b, b^c in next round
930	eor	x16,x16,x27,ror#41	// Sigma1(e)
931	eor	x8,x8,x23,ror#34
932	add	x22,x22,x17			// h+=Ch(e,f,g)
933	and	x19,x19,x28			// (b^c)&=(a^b)
934	eor	x6,x6,x15,ror#61
935	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
936	add	x22,x22,x16			// h+=Sigma1(e)
937	eor	x19,x19,x24			// Maj(a,b,c)
938	eor	x17,x8,x23,ror#39	// Sigma0(a)
939	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
940	add	x1,x1,x10
941	add	x26,x26,x22			// d+=h
942	add	x22,x22,x19			// h+=Maj(a,b,c)
943	ldr	x19,[x30],#8		// *K++, x28 in next round
944	add	x1,x1,x7
945	add	x22,x22,x17			// h+=Sigma0(a)
946	add	x1,x1,x6
947	ldr	x6,[sp,#24]
948	str	x9,[sp,#16]
949	ror	x16,x26,#14
950	add	x21,x21,x19			// h+=K[i]
951	ror	x8,x3,#1
952	and	x17,x27,x26
953	ror	x7,x0,#19
954	bic	x19,x20,x26
955	ror	x9,x22,#28
956	add	x21,x21,x1			// h+=X[i]
957	eor	x16,x16,x26,ror#18
958	eor	x8,x8,x3,ror#8
959	orr	x17,x17,x19			// Ch(e,f,g)
960	eor	x19,x22,x23			// a^b, b^c in next round
961	eor	x16,x16,x26,ror#41	// Sigma1(e)
962	eor	x9,x9,x22,ror#34
963	add	x21,x21,x17			// h+=Ch(e,f,g)
964	and	x28,x28,x19			// (b^c)&=(a^b)
965	eor	x7,x7,x0,ror#61
966	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
967	add	x21,x21,x16			// h+=Sigma1(e)
968	eor	x28,x28,x23			// Maj(a,b,c)
969	eor	x17,x9,x22,ror#39	// Sigma0(a)
970	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
971	add	x2,x2,x11
972	add	x25,x25,x21			// d+=h
973	add	x21,x21,x28			// h+=Maj(a,b,c)
974	ldr	x28,[x30],#8		// *K++, x19 in next round
975	add	x2,x2,x8
976	add	x21,x21,x17			// h+=Sigma0(a)
977	add	x2,x2,x7
978	ldr	x7,[sp,#0]
979	str	x10,[sp,#24]
980	ror	x16,x25,#14
981	add	x20,x20,x28			// h+=K[i]
982	ror	x9,x4,#1
983	and	x17,x26,x25
984	ror	x8,x1,#19
985	bic	x28,x27,x25
986	ror	x10,x21,#28
987	add	x20,x20,x2			// h+=X[i]
988	eor	x16,x16,x25,ror#18
989	eor	x9,x9,x4,ror#8
990	orr	x17,x17,x28			// Ch(e,f,g)
991	eor	x28,x21,x22			// a^b, b^c in next round
992	eor	x16,x16,x25,ror#41	// Sigma1(e)
993	eor	x10,x10,x21,ror#34
994	add	x20,x20,x17			// h+=Ch(e,f,g)
995	and	x19,x19,x28			// (b^c)&=(a^b)
996	eor	x8,x8,x1,ror#61
997	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
998	add	x20,x20,x16			// h+=Sigma1(e)
999	eor	x19,x19,x22			// Maj(a,b,c)
1000	eor	x17,x10,x21,ror#39	// Sigma0(a)
1001	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
1002	add	x3,x3,x12
1003	add	x24,x24,x20			// d+=h
1004	add	x20,x20,x19			// h+=Maj(a,b,c)
1005	ldr	x19,[x30],#8		// *K++, x28 in next round
1006	add	x3,x3,x9
1007	add	x20,x20,x17			// h+=Sigma0(a)
1008	add	x3,x3,x8
// x19 holds the K value just fetched for the next round; a zero means the
// table terminator was reached and all rounds for this block are done.
1009	cbnz	x19,Loop_16_xx
1010
// Per-block finalization: add the working variables to the stored state,
// write the state back, and either process the next block or return.

// Recover the values parked in the frame: x0 = state pointer,
// x2 = end-of-input pointer, x1 = input pointer as saved at the top of Loop
// (already 2*8 bytes into the current block).
1011	ldp	x0,x2,[x29,#96]
1012	ldr	x1,[x29,#112]
// 648 = 81*8: the 80 table constants plus the zero terminator consumed by
// the post-incrementing loads, so x30 points at the start of LK512 again.
1013	sub	x30,x30,#648		// rewind
1014
1015	ldp	x3,x4,[x0]
1016	ldp	x5,x6,[x0,#2*8]
// 2*8 (already applied before the save) + 14*8 = 128 bytes = one block.
1017	add	x1,x1,#14*8			// advance input pointer
1018	ldp	x7,x8,[x0,#4*8]
1019	add	x20,x20,x3
1020	ldp	x9,x10,[x0,#6*8]
1021	add	x21,x21,x4
1022	add	x22,x22,x5
1023	add	x23,x23,x6
1024	stp	x20,x21,[x0]
1025	add	x24,x24,x7
1026	add	x25,x25,x8
1027	stp	x22,x23,[x0,#2*8]
1028	add	x26,x26,x9
1029	add	x27,x27,x10
// Compare the input pointer with the end pointer; the stp instructions that
// follow do not modify the flags, so b.ne still sees this comparison.
1030	cmp	x1,x2
1031	stp	x24,x25,[x0,#4*8]
1032	stp	x26,x27,[x0,#6*8]
1033	b.ne	Loop
1034
// Epilogue: restore x19..x28 from the frame (addressed via x29), drop the
// 4*8-byte spill area, then pop the 128-byte frame together with x29/x30.
1035	ldp	x19,x20,[x29,#16]
1036	add	sp,sp,#4*8
1037	ldp	x21,x22,[x29,#32]
1038	ldp	x23,x24,[x29,#48]
1039	ldp	x25,x26,[x29,#64]
1040	ldp	x27,x28,[x29,#80]
1041	ldp	x29,x30,[sp],#128
1042	AARCH64_VALIDATE_LINK_REGISTER
1043	ret
1044
1045
// Read-only data for the SHA-512 code in this file.
1046.section	.rodata
1047.align	6
1048
// LK512: the table the round code refers to as "K512[i]".
// Layout: 40 .quad lines of two 64-bit words each (80 words = 640 bytes),
// followed by a single zero word.
//  - sha512_block_armv8 points x3 at this label, consumes the table 16 bytes
//    at a time and steps x3 back by 80*8 bytes at the end of every block, so
//    it never reads the trailing zero.
//  - The scalar loop earlier in the file ends on `cbnz x19,Loop_16_xx` right
//    after a `*K++` load; presumably that load walks this table and the zero
//    word is what stops it -- TODO confirm that x30 is set to LK512 there.
1049LK512:
1050.quad	0x428a2f98d728ae22,0x7137449123ef65cd
1051.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
1052.quad	0x3956c25bf348b538,0x59f111f1b605d019
1053.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
1054.quad	0xd807aa98a3030242,0x12835b0145706fbe
1055.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
1056.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
1057.quad	0x9bdc06a725c71235,0xc19bf174cf692694
1058.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
1059.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
1060.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
1061.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
1062.quad	0x983e5152ee66dfab,0xa831c66d2db43210
1063.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
1064.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
1065.quad	0x06ca6351e003826f,0x142929670a0e6e70
1066.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
1067.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
1068.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
1069.quad	0x81c2c92e47edaee6,0x92722c851482353b
1070.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
1071.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
1072.quad	0xd192e819d6ef5218,0xd69906245565a910
1073.quad	0xf40e35855771202a,0x106aa07032bbd1b8
1074.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
1075.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
1076.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
1077.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
1078.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
1079.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
1080.quad	0x90befffa23631e28,0xa4506cebde82bde9
1081.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
1082.quad	0xca273eceea26619c,0xd186b8c721c0c207
1083.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
1084.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
1085.quad	0x113f9804bef90dae,0x1b710b35131c471b
1086.quad	0x28db77f523047d84,0x32caab7b40c72493
1087.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
1088.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
1089.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
1090.quad	0	// terminator
1091
// NUL-terminated ASCII identification string:
// "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
1092.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// The alignment directive below is emitted twice; the second one is redundant.
1093.align	2
1094.align	2
1095.text
1096#ifndef	__KERNEL__
// sha512_block_armv8 -- entry and one-time setup for the SHA-512 block loop
// that uses the SHA-512 instructions (emitted as raw .long words in Loop_hw).
//
// Registers as used by the code below:
//   x0  pointer to the 64-byte hash context (read here, written back on exit)
//   x1  input pointer; advanced by 128 bytes here while loading the first block
//   x2  number of 128-byte blocks (decremented once per Loop_hw iteration)
//   x3  pointer into the LK512 constant table
//   v0-v3    working hash state
//   v16-v23  current 128-byte message block
//
// Lv8_entry is a second label for the same address; presumably it is the
// branch target used by code outside this section -- TODO confirm.
//
// COFF symbol declaration for sha512_block_armv8.
1097.def sha512_block_armv8
1098   .type 32
1099.endef
1100.align	6
1101sha512_block_armv8:
1102Lv8_entry:
// Push a 16-byte frame record (x29, x30) and set the frame pointer.
1103	stp	x29,x30,[sp,#-16]!
1104	add	x29,sp,#0
1105
// Two post-incrementing 64-byte loads fetch the whole first block.
1106	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64	// load input
1107	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
1108
1109	ld1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// load context
// x3 = address of LK512 (page address + low 12 bits).
1110	adrp	x3,LK512
1111	add	x3,x3,:lo12:LK512
1112
// Reverse the byte order inside every 64-bit lane of the message block
// (this file is only built for little-endian AArch64, see the __AARCH64EL__
// guard at the top of the file).
1113	rev64	v16.16b,v16.16b
1114	rev64	v17.16b,v17.16b
1115	rev64	v18.16b,v18.16b
1116	rev64	v19.16b,v19.16b
1117	rev64	v20.16b,v20.16b
1118	rev64	v21.16b,v21.16b
1119	rev64	v22.16b,v22.16b
1120	rev64	v23.16b,v23.16b
1121	b	Loop_hw
1122
// Loop_hw -- main loop of sha512_block_armv8; one iteration per 128-byte block.
//
// On entry to every iteration:
//   v0-v3    hash state,  v16-v23  byte-swapped message block
//   x0       context pointer,  x1  pointer just past the block held in v16-v23
//   x2       blocks still to process,  x3  start of LK512
// Scratch: x4, v4-v7, v24/v25 (constant pairs), v26-v29 (saved input state).
//
// The body is 40 groups; each group consumes one 16-byte pair of constants
// loaded from x3. The SHA-512 instructions are emitted as raw .long words with
// the mnemonic in the trailing comment -- presumably so the file assembles
// with toolchains that lack these mnemonics. The instruction order is
// hand-scheduled; do not reorder.
1123.align	4
1124Loop_hw:
1125	ld1	{v24.2d},[x3],#16
1126	subs	x2,x2,#1
1127	sub	x4,x1,#128
// Keep a copy of the incoming state in v26-v29 (orr vD,vS,vS is a register
// move); it is added back after the last group.
1128	orr	v26.16b,v0.16b,v0.16b			// offload
1129	orr	v27.16b,v1.16b,v1.16b
1130	orr	v28.16b,v2.16b,v2.16b
1131	orr	v29.16b,v3.16b,v3.16b
// If this is the last block (x2 reached 0) step x1 back by 128 so that the
// "load next input" loads further down re-read the current block --
// presumably to avoid reading past the end of the caller's buffer.
1132	csel	x1,x1,x4,ne			// conditional rewind
// Groups 1-32: state update plus message-schedule update
// (sha512su0/sha512su1) of v16-v23.
1133	add	v24.2d,v24.2d,v16.2d
1134	ld1	{v25.2d},[x3],#16
1135	ext	v24.16b,v24.16b,v24.16b,#8
1136	ext	v5.16b,v2.16b,v3.16b,#8
1137	ext	v6.16b,v1.16b,v2.16b,#8
1138	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1139.long	0xcec08230	//sha512su0 v16.16b,v17.16b
1140	ext	v7.16b,v20.16b,v21.16b,#8
1141.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1142.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1143	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1144.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1145	add	v25.2d,v25.2d,v17.2d
1146	ld1	{v24.2d},[x3],#16
1147	ext	v25.16b,v25.16b,v25.16b,#8
1148	ext	v5.16b,v4.16b,v2.16b,#8
1149	ext	v6.16b,v0.16b,v4.16b,#8
1150	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1151.long	0xcec08251	//sha512su0 v17.16b,v18.16b
1152	ext	v7.16b,v21.16b,v22.16b,#8
1153.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1154.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1155	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1156.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1157	add	v24.2d,v24.2d,v18.2d
1158	ld1	{v25.2d},[x3],#16
1159	ext	v24.16b,v24.16b,v24.16b,#8
1160	ext	v5.16b,v1.16b,v4.16b,#8
1161	ext	v6.16b,v3.16b,v1.16b,#8
1162	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1163.long	0xcec08272	//sha512su0 v18.16b,v19.16b
1164	ext	v7.16b,v22.16b,v23.16b,#8
1165.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1166.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1167	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1168.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1169	add	v25.2d,v25.2d,v19.2d
1170	ld1	{v24.2d},[x3],#16
1171	ext	v25.16b,v25.16b,v25.16b,#8
1172	ext	v5.16b,v0.16b,v1.16b,#8
1173	ext	v6.16b,v2.16b,v0.16b,#8
1174	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1175.long	0xcec08293	//sha512su0 v19.16b,v20.16b
1176	ext	v7.16b,v23.16b,v16.16b,#8
1177.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1178.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1179	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1180.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1181	add	v24.2d,v24.2d,v20.2d
1182	ld1	{v25.2d},[x3],#16
1183	ext	v24.16b,v24.16b,v24.16b,#8
1184	ext	v5.16b,v3.16b,v0.16b,#8
1185	ext	v6.16b,v4.16b,v3.16b,#8
1186	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1187.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
1188	ext	v7.16b,v16.16b,v17.16b,#8
1189.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1190.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1191	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1192.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1193	add	v25.2d,v25.2d,v21.2d
1194	ld1	{v24.2d},[x3],#16
1195	ext	v25.16b,v25.16b,v25.16b,#8
1196	ext	v5.16b,v2.16b,v3.16b,#8
1197	ext	v6.16b,v1.16b,v2.16b,#8
1198	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1199.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
1200	ext	v7.16b,v17.16b,v18.16b,#8
1201.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1202.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1203	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1204.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1205	add	v24.2d,v24.2d,v22.2d
1206	ld1	{v25.2d},[x3],#16
1207	ext	v24.16b,v24.16b,v24.16b,#8
1208	ext	v5.16b,v4.16b,v2.16b,#8
1209	ext	v6.16b,v0.16b,v4.16b,#8
1210	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1211.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
1212	ext	v7.16b,v18.16b,v19.16b,#8
1213.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1214.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1215	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1216.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1217	add	v25.2d,v25.2d,v23.2d
1218	ld1	{v24.2d},[x3],#16
1219	ext	v25.16b,v25.16b,v25.16b,#8
1220	ext	v5.16b,v1.16b,v4.16b,#8
1221	ext	v6.16b,v3.16b,v1.16b,#8
1222	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1223.long	0xcec08217	//sha512su0 v23.16b,v16.16b
1224	ext	v7.16b,v19.16b,v20.16b,#8
1225.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1226.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1227	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1228.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1229	add	v24.2d,v24.2d,v16.2d
1230	ld1	{v25.2d},[x3],#16
1231	ext	v24.16b,v24.16b,v24.16b,#8
1232	ext	v5.16b,v0.16b,v1.16b,#8
1233	ext	v6.16b,v2.16b,v0.16b,#8
1234	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1235.long	0xcec08230	//sha512su0 v16.16b,v17.16b
1236	ext	v7.16b,v20.16b,v21.16b,#8
1237.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1238.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1239	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1240.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1241	add	v25.2d,v25.2d,v17.2d
1242	ld1	{v24.2d},[x3],#16
1243	ext	v25.16b,v25.16b,v25.16b,#8
1244	ext	v5.16b,v3.16b,v0.16b,#8
1245	ext	v6.16b,v4.16b,v3.16b,#8
1246	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1247.long	0xcec08251	//sha512su0 v17.16b,v18.16b
1248	ext	v7.16b,v21.16b,v22.16b,#8
1249.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1250.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1251	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1252.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1253	add	v24.2d,v24.2d,v18.2d
1254	ld1	{v25.2d},[x3],#16
1255	ext	v24.16b,v24.16b,v24.16b,#8
1256	ext	v5.16b,v2.16b,v3.16b,#8
1257	ext	v6.16b,v1.16b,v2.16b,#8
1258	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1259.long	0xcec08272	//sha512su0 v18.16b,v19.16b
1260	ext	v7.16b,v22.16b,v23.16b,#8
1261.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1262.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1263	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1264.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1265	add	v25.2d,v25.2d,v19.2d
1266	ld1	{v24.2d},[x3],#16
1267	ext	v25.16b,v25.16b,v25.16b,#8
1268	ext	v5.16b,v4.16b,v2.16b,#8
1269	ext	v6.16b,v0.16b,v4.16b,#8
1270	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1271.long	0xcec08293	//sha512su0 v19.16b,v20.16b
1272	ext	v7.16b,v23.16b,v16.16b,#8
1273.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1274.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1275	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1276.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1277	add	v24.2d,v24.2d,v20.2d
1278	ld1	{v25.2d},[x3],#16
1279	ext	v24.16b,v24.16b,v24.16b,#8
1280	ext	v5.16b,v1.16b,v4.16b,#8
1281	ext	v6.16b,v3.16b,v1.16b,#8
1282	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1283.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
1284	ext	v7.16b,v16.16b,v17.16b,#8
1285.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1286.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1287	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1288.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1289	add	v25.2d,v25.2d,v21.2d
1290	ld1	{v24.2d},[x3],#16
1291	ext	v25.16b,v25.16b,v25.16b,#8
1292	ext	v5.16b,v0.16b,v1.16b,#8
1293	ext	v6.16b,v2.16b,v0.16b,#8
1294	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1295.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
1296	ext	v7.16b,v17.16b,v18.16b,#8
1297.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1298.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1299	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1300.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1301	add	v24.2d,v24.2d,v22.2d
1302	ld1	{v25.2d},[x3],#16
1303	ext	v24.16b,v24.16b,v24.16b,#8
1304	ext	v5.16b,v3.16b,v0.16b,#8
1305	ext	v6.16b,v4.16b,v3.16b,#8
1306	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1307.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
1308	ext	v7.16b,v18.16b,v19.16b,#8
1309.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1310.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1311	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1312.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1313	add	v25.2d,v25.2d,v23.2d
1314	ld1	{v24.2d},[x3],#16
1315	ext	v25.16b,v25.16b,v25.16b,#8
1316	ext	v5.16b,v2.16b,v3.16b,#8
1317	ext	v6.16b,v1.16b,v2.16b,#8
1318	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1319.long	0xcec08217	//sha512su0 v23.16b,v16.16b
1320	ext	v7.16b,v19.16b,v20.16b,#8
1321.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1322.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1323	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1324.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1325	add	v24.2d,v24.2d,v16.2d
1326	ld1	{v25.2d},[x3],#16
1327	ext	v24.16b,v24.16b,v24.16b,#8
1328	ext	v5.16b,v4.16b,v2.16b,#8
1329	ext	v6.16b,v0.16b,v4.16b,#8
1330	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1331.long	0xcec08230	//sha512su0 v16.16b,v17.16b
1332	ext	v7.16b,v20.16b,v21.16b,#8
1333.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1334.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1335	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1336.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1337	add	v25.2d,v25.2d,v17.2d
1338	ld1	{v24.2d},[x3],#16
1339	ext	v25.16b,v25.16b,v25.16b,#8
1340	ext	v5.16b,v1.16b,v4.16b,#8
1341	ext	v6.16b,v3.16b,v1.16b,#8
1342	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1343.long	0xcec08251	//sha512su0 v17.16b,v18.16b
1344	ext	v7.16b,v21.16b,v22.16b,#8
1345.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1346.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1347	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1348.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1349	add	v24.2d,v24.2d,v18.2d
1350	ld1	{v25.2d},[x3],#16
1351	ext	v24.16b,v24.16b,v24.16b,#8
1352	ext	v5.16b,v0.16b,v1.16b,#8
1353	ext	v6.16b,v2.16b,v0.16b,#8
1354	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1355.long	0xcec08272	//sha512su0 v18.16b,v19.16b
1356	ext	v7.16b,v22.16b,v23.16b,#8
1357.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1358.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1359	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1360.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1361	add	v25.2d,v25.2d,v19.2d
1362	ld1	{v24.2d},[x3],#16
1363	ext	v25.16b,v25.16b,v25.16b,#8
1364	ext	v5.16b,v3.16b,v0.16b,#8
1365	ext	v6.16b,v4.16b,v3.16b,#8
1366	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1367.long	0xcec08293	//sha512su0 v19.16b,v20.16b
1368	ext	v7.16b,v23.16b,v16.16b,#8
1369.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1370.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1371	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1372.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1373	add	v24.2d,v24.2d,v20.2d
1374	ld1	{v25.2d},[x3],#16
1375	ext	v24.16b,v24.16b,v24.16b,#8
1376	ext	v5.16b,v2.16b,v3.16b,#8
1377	ext	v6.16b,v1.16b,v2.16b,#8
1378	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1379.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
1380	ext	v7.16b,v16.16b,v17.16b,#8
1381.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1382.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1383	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1384.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1385	add	v25.2d,v25.2d,v21.2d
1386	ld1	{v24.2d},[x3],#16
1387	ext	v25.16b,v25.16b,v25.16b,#8
1388	ext	v5.16b,v4.16b,v2.16b,#8
1389	ext	v6.16b,v0.16b,v4.16b,#8
1390	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1391.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
1392	ext	v7.16b,v17.16b,v18.16b,#8
1393.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1394.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1395	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1396.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1397	add	v24.2d,v24.2d,v22.2d
1398	ld1	{v25.2d},[x3],#16
1399	ext	v24.16b,v24.16b,v24.16b,#8
1400	ext	v5.16b,v1.16b,v4.16b,#8
1401	ext	v6.16b,v3.16b,v1.16b,#8
1402	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1403.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
1404	ext	v7.16b,v18.16b,v19.16b,#8
1405.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1406.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1407	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1408.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1409	add	v25.2d,v25.2d,v23.2d
1410	ld1	{v24.2d},[x3],#16
1411	ext	v25.16b,v25.16b,v25.16b,#8
1412	ext	v5.16b,v0.16b,v1.16b,#8
1413	ext	v6.16b,v2.16b,v0.16b,#8
1414	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1415.long	0xcec08217	//sha512su0 v23.16b,v16.16b
1416	ext	v7.16b,v19.16b,v20.16b,#8
1417.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1418.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1419	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1420.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1421	add	v24.2d,v24.2d,v16.2d
1422	ld1	{v25.2d},[x3],#16
1423	ext	v24.16b,v24.16b,v24.16b,#8
1424	ext	v5.16b,v3.16b,v0.16b,#8
1425	ext	v6.16b,v4.16b,v3.16b,#8
1426	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1427.long	0xcec08230	//sha512su0 v16.16b,v17.16b
1428	ext	v7.16b,v20.16b,v21.16b,#8
1429.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1430.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1431	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1432.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1433	add	v25.2d,v25.2d,v17.2d
1434	ld1	{v24.2d},[x3],#16
1435	ext	v25.16b,v25.16b,v25.16b,#8
1436	ext	v5.16b,v2.16b,v3.16b,#8
1437	ext	v6.16b,v1.16b,v2.16b,#8
1438	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1439.long	0xcec08251	//sha512su0 v17.16b,v18.16b
1440	ext	v7.16b,v21.16b,v22.16b,#8
1441.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1442.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1443	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1444.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1445	add	v24.2d,v24.2d,v18.2d
1446	ld1	{v25.2d},[x3],#16
1447	ext	v24.16b,v24.16b,v24.16b,#8
1448	ext	v5.16b,v4.16b,v2.16b,#8
1449	ext	v6.16b,v0.16b,v4.16b,#8
1450	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1451.long	0xcec08272	//sha512su0 v18.16b,v19.16b
1452	ext	v7.16b,v22.16b,v23.16b,#8
1453.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1454.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1455	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1456.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1457	add	v25.2d,v25.2d,v19.2d
1458	ld1	{v24.2d},[x3],#16
1459	ext	v25.16b,v25.16b,v25.16b,#8
1460	ext	v5.16b,v1.16b,v4.16b,#8
1461	ext	v6.16b,v3.16b,v1.16b,#8
1462	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1463.long	0xcec08293	//sha512su0 v19.16b,v20.16b
1464	ext	v7.16b,v23.16b,v16.16b,#8
1465.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1466.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1467	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1468.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1469	add	v24.2d,v24.2d,v20.2d
1470	ld1	{v25.2d},[x3],#16
1471	ext	v24.16b,v24.16b,v24.16b,#8
1472	ext	v5.16b,v0.16b,v1.16b,#8
1473	ext	v6.16b,v2.16b,v0.16b,#8
1474	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1475.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
1476	ext	v7.16b,v16.16b,v17.16b,#8
1477.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1478.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1479	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1480.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1481	add	v25.2d,v25.2d,v21.2d
1482	ld1	{v24.2d},[x3],#16
1483	ext	v25.16b,v25.16b,v25.16b,#8
1484	ext	v5.16b,v3.16b,v0.16b,#8
1485	ext	v6.16b,v4.16b,v3.16b,#8
1486	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1487.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
1488	ext	v7.16b,v17.16b,v18.16b,#8
1489.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1490.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1491	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1492.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1493	add	v24.2d,v24.2d,v22.2d
1494	ld1	{v25.2d},[x3],#16
1495	ext	v24.16b,v24.16b,v24.16b,#8
1496	ext	v5.16b,v2.16b,v3.16b,#8
1497	ext	v6.16b,v1.16b,v2.16b,#8
1498	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1499.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
1500	ext	v7.16b,v18.16b,v19.16b,#8
1501.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1502.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1503	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1504.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1505	add	v25.2d,v25.2d,v23.2d
1506	ld1	{v24.2d},[x3],#16
1507	ext	v25.16b,v25.16b,v25.16b,#8
1508	ext	v5.16b,v4.16b,v2.16b,#8
1509	ext	v6.16b,v0.16b,v4.16b,#8
1510	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1511.long	0xcec08217	//sha512su0 v23.16b,v16.16b
1512	ext	v7.16b,v19.16b,v20.16b,#8
1513.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1514.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1515	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1516.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
// Groups 33-40: no more schedule updates. As soon as a message register has
// been added to its constant pair it is reloaded from x1 and byte-swapped,
// so v16-v23 hold the next block by the end of the iteration.
1517	ld1	{v25.2d},[x3],#16
1518	add	v24.2d,v24.2d,v16.2d
1519	ld1	{v16.16b},[x1],#16		// load next input
1520	ext	v24.16b,v24.16b,v24.16b,#8
1521	ext	v5.16b,v1.16b,v4.16b,#8
1522	ext	v6.16b,v3.16b,v1.16b,#8
1523	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1524.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1525	rev64	v16.16b,v16.16b
1526	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1527.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1528	ld1	{v24.2d},[x3],#16
1529	add	v25.2d,v25.2d,v17.2d
1530	ld1	{v17.16b},[x1],#16		// load next input
1531	ext	v25.16b,v25.16b,v25.16b,#8
1532	ext	v5.16b,v0.16b,v1.16b,#8
1533	ext	v6.16b,v2.16b,v0.16b,#8
1534	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1535.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1536	rev64	v17.16b,v17.16b
1537	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1538.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1539	ld1	{v25.2d},[x3],#16
1540	add	v24.2d,v24.2d,v18.2d
1541	ld1	{v18.16b},[x1],#16		// load next input
1542	ext	v24.16b,v24.16b,v24.16b,#8
1543	ext	v5.16b,v3.16b,v0.16b,#8
1544	ext	v6.16b,v4.16b,v3.16b,#8
1545	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1546.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1547	rev64	v18.16b,v18.16b
1548	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1549.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1550	ld1	{v24.2d},[x3],#16
1551	add	v25.2d,v25.2d,v19.2d
1552	ld1	{v19.16b},[x1],#16		// load next input
1553	ext	v25.16b,v25.16b,v25.16b,#8
1554	ext	v5.16b,v2.16b,v3.16b,#8
1555	ext	v6.16b,v1.16b,v2.16b,#8
1556	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1557.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1558	rev64	v19.16b,v19.16b
1559	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1560.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1561	ld1	{v25.2d},[x3],#16
1562	add	v24.2d,v24.2d,v20.2d
1563	ld1	{v20.16b},[x1],#16		// load next input
1564	ext	v24.16b,v24.16b,v24.16b,#8
1565	ext	v5.16b,v4.16b,v2.16b,#8
1566	ext	v6.16b,v0.16b,v4.16b,#8
1567	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1568.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1569	rev64	v20.16b,v20.16b
1570	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1571.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1572	ld1	{v24.2d},[x3],#16
1573	add	v25.2d,v25.2d,v21.2d
1574	ld1	{v21.16b},[x1],#16		// load next input
1575	ext	v25.16b,v25.16b,v25.16b,#8
1576	ext	v5.16b,v1.16b,v4.16b,#8
1577	ext	v6.16b,v3.16b,v1.16b,#8
1578	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1579.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1580	rev64	v21.16b,v21.16b
1581	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1582.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1583	ld1	{v25.2d},[x3],#16
1584	add	v24.2d,v24.2d,v22.2d
1585	ld1	{v22.16b},[x1],#16		// load next input
1586	ext	v24.16b,v24.16b,v24.16b,#8
1587	ext	v5.16b,v0.16b,v1.16b,#8
1588	ext	v6.16b,v2.16b,v0.16b,#8
1589	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1590.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1591	rev64	v22.16b,v22.16b
1592	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1593.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
// x3 has advanced by 40 loads of 16 bytes; step it back to the start of LK512
// so the next iteration begins with the first constant pair again.
1594	sub	x3,x3,#80*8	// rewind
1595	add	v25.2d,v25.2d,v23.2d
1596	ld1	{v23.16b},[x1],#16		// load next input
1597	ext	v25.16b,v25.16b,v25.16b,#8
1598	ext	v5.16b,v3.16b,v0.16b,#8
1599	ext	v6.16b,v4.16b,v3.16b,#8
1600	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1601.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1602	rev64	v23.16b,v23.16b
1603	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1604.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
// Add the state saved in v26-v29 at the top of the iteration to the new state.
1605	add	v0.2d,v0.2d,v26.2d			// accumulate
1606	add	v1.2d,v1.2d,v27.2d
1607	add	v2.2d,v2.2d,v28.2d
1608	add	v3.2d,v3.2d,v29.2d
1609
// x2 was decremented by the subs at the top of the loop; repeat while it is
// non-zero.
1610	cbnz	x2,Loop_hw
1611
1612	st1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// store context
1613
// Pop the 16-byte frame. Only x29 is reloaded: nothing in this routine writes
// x30, so the return address is still live in the register.
1614	ldr	x29,[sp],#16
1615	ret
1616
1617#endif
1618#endif  // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(_WIN32)
1619#if defined(__ELF__)
1620// See https://www.airs.com/blog/archives/518.
1621.section .note.GNU-stack,"",%progbits
1622#endif
1623