• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
2//
3// Licensed under the Apache License 2.0 (the "License").  You may not use
4// this file except in compliance with the License.  You can obtain a copy
5// in the file LICENSE in the source distribution or at
6// https://www.openssl.org/source/license.html
7
8// ====================================================================
9// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
10// project. The module is, however, dual licensed under OpenSSL and
11// CRYPTOGAMS licenses depending on where you obtain it. For further
12// details see http://www.openssl.org/~appro/cryptogams/.
13//
14// Permission to use under GPLv2 terms is granted.
15// ====================================================================
16//
17// SHA256/512 for ARMv8.
18//
19// Performance in cycles per processed byte and improvement coefficient
20// over code generated with "default" compiler:
21//
22//		SHA256-hw	SHA256(*)	SHA512
23// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
24// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
25// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
26// Denver	2.01		10.5 (+26%)	6.70 (+8%)
27// X-Gene			20.0 (+100%)	12.8 (+300%(***))
28// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
29// Kryo		1.92		17.4 (+30%)	11.2 (+8%)
30// ThunderX2	2.54		13.2 (+40%)	8.40 (+18%)
31//
32// (*)	Software SHA256 results are of lesser relevance, presented
33//	mostly for informational purposes.
34// (**)	The result is a trade-off: it's possible to improve it by
35//	10% (or by 1 cycle per round), but at the cost of 20% loss
36//	on Cortex-A53 (or by 4 cycles per round).
37// (***)	Super-impressive coefficients over gcc-generated code are
38//	indication of some compiler "pathology", most notably code
39//	generated with -mgeneral-regs-only is significantly faster
40//	and the gap is only 40-90%.
41//
42// October 2016.
43//
44// Originally it was reckoned that it makes no sense to implement NEON
45// version of SHA256 for 64-bit processors. This is because performance
46// improvement on most wide-spread Cortex-A5x processors was observed
47// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
48// observed that 32-bit NEON SHA256 performs significantly better than
49// 64-bit scalar version on *some* of the more recent processors. As a
50// result, a 64-bit NEON version of SHA256 was added to provide the best
51// all-round performance. For example, it executes ~30% faster on X-Gene
52// and Mongoose. [For reference, NEON version of SHA512 is bound to
53// deliver much less improvement, likely *negative* on Cortex-A5x.
54// Which is why NEON support is limited to SHA256.]
55
56// $output is the last argument if it looks like a file (it has an extension)
57// $flavour is the first argument if it doesn't look like a file
58#ifndef	__KERNEL__
59# include "arm_arch.h"
60
61.hidden	OPENSSL_armcap_P
62#endif
63
64.text
65
66.globl	sha512_block_data_order
67.type	sha512_block_data_order,%function
68.align	6
69sha512_block_data_order:
70#ifndef	__KERNEL__
71	adrp	x16,OPENSSL_armcap_P
72	ldr	w16,[x16,#:lo12:OPENSSL_armcap_P]
73	tst	w16,#ARMV8_SHA512
74	b.ne	.Lv8_entry
75#endif
76.inst	0xd503233f				// paciasp
77	stp	x29,x30,[sp,#-128]!
78	add	x29,sp,#0
79
80	stp	x19,x20,[sp,#16]
81	stp	x21,x22,[sp,#32]
82	stp	x23,x24,[sp,#48]
83	stp	x25,x26,[sp,#64]
84	stp	x27,x28,[sp,#80]
85	sub	sp,sp,#4*8
86
87	ldp	x20,x21,[x0]				// load context
88	ldp	x22,x23,[x0,#2*8]
89	ldp	x24,x25,[x0,#4*8]
90	add	x2,x1,x2,lsl#7	// end of input
91	ldp	x26,x27,[x0,#6*8]
92	adr	x30,.LK512
93	stp	x0,x2,[x29,#96]
94
// Per-block loop, rounds 0-15.  The sixteen big-endian input words are
// loaded pairwise into X[0..15] = x3..x15,x0,x1,x2 and byte-swapped with
// rev unless __AARCH64EB__ is defined.  Each round ends by loading the next
// K constant, and the final "h+=Sigma0(a)" of a round is deferred to the
// start of the following round (see the commented-out add lines).
// Sigma1(e) = ror14^ror18^ror41 and Sigma0(a) = ror28^ror34^ror39, each
// built from two eor instructions with shifted operands.
// From round 11 on, registers holding X[3..7] are spilled to [sp,#0..#24]
// so they can serve as temporaries.
// round 0: a..h = x20,x21,x22,x23,x24,x25,x26,x27; X[0]=x3
95.Loop:
96	ldp	x3,x4,[x1],#2*8
97	ldr	x19,[x30],#8			// *K++
98	eor	x28,x21,x22				// magic seed
99	str	x1,[x29,#112]			// save input pointer (already advanced by 16)
100#ifndef	__AARCH64EB__
101	rev	x3,x3			// 0
102#endif
103	ror	x16,x24,#14
104	add	x27,x27,x19			// h+=K[i]
105	eor	x6,x24,x24,ror#23
106	and	x17,x25,x24
107	bic	x19,x26,x24
108	add	x27,x27,x3			// h+=X[i]
109	orr	x17,x17,x19			// Ch(e,f,g)
110	eor	x19,x20,x21			// a^b, b^c in next round
111	eor	x16,x16,x6,ror#18	// Sigma1(e)
112	ror	x6,x20,#28
113	add	x27,x27,x17			// h+=Ch(e,f,g)
114	eor	x17,x20,x20,ror#5
115	add	x27,x27,x16			// h+=Sigma1(e)
116	and	x28,x28,x19			// (b^c)&=(a^b)
117	add	x23,x23,x27			// d+=h
118	eor	x28,x28,x21			// Maj(a,b,c)
119	eor	x17,x6,x17,ror#34	// Sigma0(a)
120	add	x27,x27,x28			// h+=Maj(a,b,c)
121	ldr	x28,[x30],#8		// *K++, x19 in next round
122	//add	x27,x27,x17			// h+=Sigma0(a)
// round 1: a..h = x27,x20,x21,x22,x23,x24,x25,x26; X[1]=x4
123#ifndef	__AARCH64EB__
124	rev	x4,x4			// 1
125#endif
126	ldp	x5,x6,[x1],#2*8
127	add	x27,x27,x17			// h+=Sigma0(a)
128	ror	x16,x23,#14
129	add	x26,x26,x28			// h+=K[i]
130	eor	x7,x23,x23,ror#23
131	and	x17,x24,x23
132	bic	x28,x25,x23
133	add	x26,x26,x4			// h+=X[i]
134	orr	x17,x17,x28			// Ch(e,f,g)
135	eor	x28,x27,x20			// a^b, b^c in next round
136	eor	x16,x16,x7,ror#18	// Sigma1(e)
137	ror	x7,x27,#28
138	add	x26,x26,x17			// h+=Ch(e,f,g)
139	eor	x17,x27,x27,ror#5
140	add	x26,x26,x16			// h+=Sigma1(e)
141	and	x19,x19,x28			// (b^c)&=(a^b)
142	add	x22,x22,x26			// d+=h
143	eor	x19,x19,x20			// Maj(a,b,c)
144	eor	x17,x7,x17,ror#34	// Sigma0(a)
145	add	x26,x26,x19			// h+=Maj(a,b,c)
146	ldr	x19,[x30],#8		// *K++, x28 in next round
147	//add	x26,x26,x17			// h+=Sigma0(a)
// round 2: a..h = x26,x27,x20,x21,x22,x23,x24,x25; X[2]=x5
148#ifndef	__AARCH64EB__
149	rev	x5,x5			// 2
150#endif
151	add	x26,x26,x17			// h+=Sigma0(a)
152	ror	x16,x22,#14
153	add	x25,x25,x19			// h+=K[i]
154	eor	x8,x22,x22,ror#23
155	and	x17,x23,x22
156	bic	x19,x24,x22
157	add	x25,x25,x5			// h+=X[i]
158	orr	x17,x17,x19			// Ch(e,f,g)
159	eor	x19,x26,x27			// a^b, b^c in next round
160	eor	x16,x16,x8,ror#18	// Sigma1(e)
161	ror	x8,x26,#28
162	add	x25,x25,x17			// h+=Ch(e,f,g)
163	eor	x17,x26,x26,ror#5
164	add	x25,x25,x16			// h+=Sigma1(e)
165	and	x28,x28,x19			// (b^c)&=(a^b)
166	add	x21,x21,x25			// d+=h
167	eor	x28,x28,x27			// Maj(a,b,c)
168	eor	x17,x8,x17,ror#34	// Sigma0(a)
169	add	x25,x25,x28			// h+=Maj(a,b,c)
170	ldr	x28,[x30],#8		// *K++, x19 in next round
171	//add	x25,x25,x17			// h+=Sigma0(a)
// round 3: a..h = x25,x26,x27,x20,x21,x22,x23,x24; X[3]=x6
172#ifndef	__AARCH64EB__
173	rev	x6,x6			// 3
174#endif
175	ldp	x7,x8,[x1],#2*8
176	add	x25,x25,x17			// h+=Sigma0(a)
177	ror	x16,x21,#14
178	add	x24,x24,x28			// h+=K[i]
179	eor	x9,x21,x21,ror#23
180	and	x17,x22,x21
181	bic	x28,x23,x21
182	add	x24,x24,x6			// h+=X[i]
183	orr	x17,x17,x28			// Ch(e,f,g)
184	eor	x28,x25,x26			// a^b, b^c in next round
185	eor	x16,x16,x9,ror#18	// Sigma1(e)
186	ror	x9,x25,#28
187	add	x24,x24,x17			// h+=Ch(e,f,g)
188	eor	x17,x25,x25,ror#5
189	add	x24,x24,x16			// h+=Sigma1(e)
190	and	x19,x19,x28			// (b^c)&=(a^b)
191	add	x20,x20,x24			// d+=h
192	eor	x19,x19,x26			// Maj(a,b,c)
193	eor	x17,x9,x17,ror#34	// Sigma0(a)
194	add	x24,x24,x19			// h+=Maj(a,b,c)
195	ldr	x19,[x30],#8		// *K++, x28 in next round
196	//add	x24,x24,x17			// h+=Sigma0(a)
// round 4: a..h = x24,x25,x26,x27,x20,x21,x22,x23; X[4]=x7
197#ifndef	__AARCH64EB__
198	rev	x7,x7			// 4
199#endif
200	add	x24,x24,x17			// h+=Sigma0(a)
201	ror	x16,x20,#14
202	add	x23,x23,x19			// h+=K[i]
203	eor	x10,x20,x20,ror#23
204	and	x17,x21,x20
205	bic	x19,x22,x20
206	add	x23,x23,x7			// h+=X[i]
207	orr	x17,x17,x19			// Ch(e,f,g)
208	eor	x19,x24,x25			// a^b, b^c in next round
209	eor	x16,x16,x10,ror#18	// Sigma1(e)
210	ror	x10,x24,#28
211	add	x23,x23,x17			// h+=Ch(e,f,g)
212	eor	x17,x24,x24,ror#5
213	add	x23,x23,x16			// h+=Sigma1(e)
214	and	x28,x28,x19			// (b^c)&=(a^b)
215	add	x27,x27,x23			// d+=h
216	eor	x28,x28,x25			// Maj(a,b,c)
217	eor	x17,x10,x17,ror#34	// Sigma0(a)
218	add	x23,x23,x28			// h+=Maj(a,b,c)
219	ldr	x28,[x30],#8		// *K++, x19 in next round
220	//add	x23,x23,x17			// h+=Sigma0(a)
// round 5: a..h = x23,x24,x25,x26,x27,x20,x21,x22; X[5]=x8
221#ifndef	__AARCH64EB__
222	rev	x8,x8			// 5
223#endif
224	ldp	x9,x10,[x1],#2*8
225	add	x23,x23,x17			// h+=Sigma0(a)
226	ror	x16,x27,#14
227	add	x22,x22,x28			// h+=K[i]
228	eor	x11,x27,x27,ror#23
229	and	x17,x20,x27
230	bic	x28,x21,x27
231	add	x22,x22,x8			// h+=X[i]
232	orr	x17,x17,x28			// Ch(e,f,g)
233	eor	x28,x23,x24			// a^b, b^c in next round
234	eor	x16,x16,x11,ror#18	// Sigma1(e)
235	ror	x11,x23,#28
236	add	x22,x22,x17			// h+=Ch(e,f,g)
237	eor	x17,x23,x23,ror#5
238	add	x22,x22,x16			// h+=Sigma1(e)
239	and	x19,x19,x28			// (b^c)&=(a^b)
240	add	x26,x26,x22			// d+=h
241	eor	x19,x19,x24			// Maj(a,b,c)
242	eor	x17,x11,x17,ror#34	// Sigma0(a)
243	add	x22,x22,x19			// h+=Maj(a,b,c)
244	ldr	x19,[x30],#8		// *K++, x28 in next round
245	//add	x22,x22,x17			// h+=Sigma0(a)
// round 6: a..h = x22,x23,x24,x25,x26,x27,x20,x21; X[6]=x9
246#ifndef	__AARCH64EB__
247	rev	x9,x9			// 6
248#endif
249	add	x22,x22,x17			// h+=Sigma0(a)
250	ror	x16,x26,#14
251	add	x21,x21,x19			// h+=K[i]
252	eor	x12,x26,x26,ror#23
253	and	x17,x27,x26
254	bic	x19,x20,x26
255	add	x21,x21,x9			// h+=X[i]
256	orr	x17,x17,x19			// Ch(e,f,g)
257	eor	x19,x22,x23			// a^b, b^c in next round
258	eor	x16,x16,x12,ror#18	// Sigma1(e)
259	ror	x12,x22,#28
260	add	x21,x21,x17			// h+=Ch(e,f,g)
261	eor	x17,x22,x22,ror#5
262	add	x21,x21,x16			// h+=Sigma1(e)
263	and	x28,x28,x19			// (b^c)&=(a^b)
264	add	x25,x25,x21			// d+=h
265	eor	x28,x28,x23			// Maj(a,b,c)
266	eor	x17,x12,x17,ror#34	// Sigma0(a)
267	add	x21,x21,x28			// h+=Maj(a,b,c)
268	ldr	x28,[x30],#8		// *K++, x19 in next round
269	//add	x21,x21,x17			// h+=Sigma0(a)
// round 7: a..h = x21,x22,x23,x24,x25,x26,x27,x20; X[7]=x10
270#ifndef	__AARCH64EB__
271	rev	x10,x10			// 7
272#endif
273	ldp	x11,x12,[x1],#2*8
274	add	x21,x21,x17			// h+=Sigma0(a)
275	ror	x16,x25,#14
276	add	x20,x20,x28			// h+=K[i]
277	eor	x13,x25,x25,ror#23
278	and	x17,x26,x25
279	bic	x28,x27,x25
280	add	x20,x20,x10			// h+=X[i]
281	orr	x17,x17,x28			// Ch(e,f,g)
282	eor	x28,x21,x22			// a^b, b^c in next round
283	eor	x16,x16,x13,ror#18	// Sigma1(e)
284	ror	x13,x21,#28
285	add	x20,x20,x17			// h+=Ch(e,f,g)
286	eor	x17,x21,x21,ror#5
287	add	x20,x20,x16			// h+=Sigma1(e)
288	and	x19,x19,x28			// (b^c)&=(a^b)
289	add	x24,x24,x20			// d+=h
290	eor	x19,x19,x22			// Maj(a,b,c)
291	eor	x17,x13,x17,ror#34	// Sigma0(a)
292	add	x20,x20,x19			// h+=Maj(a,b,c)
293	ldr	x19,[x30],#8		// *K++, x28 in next round
294	//add	x20,x20,x17			// h+=Sigma0(a)
// round 8: a..h = x20,x21,x22,x23,x24,x25,x26,x27; X[8]=x11
295#ifndef	__AARCH64EB__
296	rev	x11,x11			// 8
297#endif
298	add	x20,x20,x17			// h+=Sigma0(a)
299	ror	x16,x24,#14
300	add	x27,x27,x19			// h+=K[i]
301	eor	x14,x24,x24,ror#23
302	and	x17,x25,x24
303	bic	x19,x26,x24
304	add	x27,x27,x11			// h+=X[i]
305	orr	x17,x17,x19			// Ch(e,f,g)
306	eor	x19,x20,x21			// a^b, b^c in next round
307	eor	x16,x16,x14,ror#18	// Sigma1(e)
308	ror	x14,x20,#28
309	add	x27,x27,x17			// h+=Ch(e,f,g)
310	eor	x17,x20,x20,ror#5
311	add	x27,x27,x16			// h+=Sigma1(e)
312	and	x28,x28,x19			// (b^c)&=(a^b)
313	add	x23,x23,x27			// d+=h
314	eor	x28,x28,x21			// Maj(a,b,c)
315	eor	x17,x14,x17,ror#34	// Sigma0(a)
316	add	x27,x27,x28			// h+=Maj(a,b,c)
317	ldr	x28,[x30],#8		// *K++, x19 in next round
318	//add	x27,x27,x17			// h+=Sigma0(a)
// round 9: a..h = x27,x20,x21,x22,x23,x24,x25,x26; X[9]=x12
319#ifndef	__AARCH64EB__
320	rev	x12,x12			// 9
321#endif
322	ldp	x13,x14,[x1],#2*8
323	add	x27,x27,x17			// h+=Sigma0(a)
324	ror	x16,x23,#14
325	add	x26,x26,x28			// h+=K[i]
326	eor	x15,x23,x23,ror#23
327	and	x17,x24,x23
328	bic	x28,x25,x23
329	add	x26,x26,x12			// h+=X[i]
330	orr	x17,x17,x28			// Ch(e,f,g)
331	eor	x28,x27,x20			// a^b, b^c in next round
332	eor	x16,x16,x15,ror#18	// Sigma1(e)
333	ror	x15,x27,#28
334	add	x26,x26,x17			// h+=Ch(e,f,g)
335	eor	x17,x27,x27,ror#5
336	add	x26,x26,x16			// h+=Sigma1(e)
337	and	x19,x19,x28			// (b^c)&=(a^b)
338	add	x22,x22,x26			// d+=h
339	eor	x19,x19,x20			// Maj(a,b,c)
340	eor	x17,x15,x17,ror#34	// Sigma0(a)
341	add	x26,x26,x19			// h+=Maj(a,b,c)
342	ldr	x19,[x30],#8		// *K++, x28 in next round
343	//add	x26,x26,x17			// h+=Sigma0(a)
// round 10: a..h = x26,x27,x20,x21,x22,x23,x24,x25; X[10]=x13
// (x0 is used as a temporary from here on; the state pointer lives at [x29,#96])
344#ifndef	__AARCH64EB__
345	rev	x13,x13			// 10
346#endif
347	add	x26,x26,x17			// h+=Sigma0(a)
348	ror	x16,x22,#14
349	add	x25,x25,x19			// h+=K[i]
350	eor	x0,x22,x22,ror#23
351	and	x17,x23,x22
352	bic	x19,x24,x22
353	add	x25,x25,x13			// h+=X[i]
354	orr	x17,x17,x19			// Ch(e,f,g)
355	eor	x19,x26,x27			// a^b, b^c in next round
356	eor	x16,x16,x0,ror#18	// Sigma1(e)
357	ror	x0,x26,#28
358	add	x25,x25,x17			// h+=Ch(e,f,g)
359	eor	x17,x26,x26,ror#5
360	add	x25,x25,x16			// h+=Sigma1(e)
361	and	x28,x28,x19			// (b^c)&=(a^b)
362	add	x21,x21,x25			// d+=h
363	eor	x28,x28,x27			// Maj(a,b,c)
364	eor	x17,x0,x17,ror#34	// Sigma0(a)
365	add	x25,x25,x28			// h+=Maj(a,b,c)
366	ldr	x28,[x30],#8		// *K++, x19 in next round
367	//add	x25,x25,x17			// h+=Sigma0(a)
// round 11: a..h = x25,x26,x27,x20,x21,x22,x23,x24; X[11]=x14
368#ifndef	__AARCH64EB__
369	rev	x14,x14			// 11
370#endif
371	ldp	x15,x0,[x1],#2*8
372	add	x25,x25,x17			// h+=Sigma0(a)
373	str	x6,[sp,#24]			// spill X[3]; x6 becomes a temporary
374	ror	x16,x21,#14
375	add	x24,x24,x28			// h+=K[i]
376	eor	x6,x21,x21,ror#23
377	and	x17,x22,x21
378	bic	x28,x23,x21
379	add	x24,x24,x14			// h+=X[i]
380	orr	x17,x17,x28			// Ch(e,f,g)
381	eor	x28,x25,x26			// a^b, b^c in next round
382	eor	x16,x16,x6,ror#18	// Sigma1(e)
383	ror	x6,x25,#28
384	add	x24,x24,x17			// h+=Ch(e,f,g)
385	eor	x17,x25,x25,ror#5
386	add	x24,x24,x16			// h+=Sigma1(e)
387	and	x19,x19,x28			// (b^c)&=(a^b)
388	add	x20,x20,x24			// d+=h
389	eor	x19,x19,x26			// Maj(a,b,c)
390	eor	x17,x6,x17,ror#34	// Sigma0(a)
391	add	x24,x24,x19			// h+=Maj(a,b,c)
392	ldr	x19,[x30],#8		// *K++, x28 in next round
393	//add	x24,x24,x17			// h+=Sigma0(a)
// round 12: a..h = x24,x25,x26,x27,x20,x21,x22,x23; X[12]=x15
394#ifndef	__AARCH64EB__
395	rev	x15,x15			// 12
396#endif
397	add	x24,x24,x17			// h+=Sigma0(a)
398	str	x7,[sp,#0]			// spill X[4]; x7 becomes a temporary
399	ror	x16,x20,#14
400	add	x23,x23,x19			// h+=K[i]
401	eor	x7,x20,x20,ror#23
402	and	x17,x21,x20
403	bic	x19,x22,x20
404	add	x23,x23,x15			// h+=X[i]
405	orr	x17,x17,x19			// Ch(e,f,g)
406	eor	x19,x24,x25			// a^b, b^c in next round
407	eor	x16,x16,x7,ror#18	// Sigma1(e)
408	ror	x7,x24,#28
409	add	x23,x23,x17			// h+=Ch(e,f,g)
410	eor	x17,x24,x24,ror#5
411	add	x23,x23,x16			// h+=Sigma1(e)
412	and	x28,x28,x19			// (b^c)&=(a^b)
413	add	x27,x27,x23			// d+=h
414	eor	x28,x28,x25			// Maj(a,b,c)
415	eor	x17,x7,x17,ror#34	// Sigma0(a)
416	add	x23,x23,x28			// h+=Maj(a,b,c)
417	ldr	x28,[x30],#8		// *K++, x19 in next round
418	//add	x23,x23,x17			// h+=Sigma0(a)
// round 13: a..h = x23,x24,x25,x26,x27,x20,x21,x22; X[13]=x0
419#ifndef	__AARCH64EB__
420	rev	x0,x0			// 13
421#endif
422	ldp	x1,x2,[x1]			// X[14],X[15]; overwrites the input pointer (copy at [x29,#112])
423	add	x23,x23,x17			// h+=Sigma0(a)
424	str	x8,[sp,#8]			// spill X[5]; x8 becomes a temporary
425	ror	x16,x27,#14
426	add	x22,x22,x28			// h+=K[i]
427	eor	x8,x27,x27,ror#23
428	and	x17,x20,x27
429	bic	x28,x21,x27
430	add	x22,x22,x0			// h+=X[i]
431	orr	x17,x17,x28			// Ch(e,f,g)
432	eor	x28,x23,x24			// a^b, b^c in next round
433	eor	x16,x16,x8,ror#18	// Sigma1(e)
434	ror	x8,x23,#28
435	add	x22,x22,x17			// h+=Ch(e,f,g)
436	eor	x17,x23,x23,ror#5
437	add	x22,x22,x16			// h+=Sigma1(e)
438	and	x19,x19,x28			// (b^c)&=(a^b)
439	add	x26,x26,x22			// d+=h
440	eor	x19,x19,x24			// Maj(a,b,c)
441	eor	x17,x8,x17,ror#34	// Sigma0(a)
442	add	x22,x22,x19			// h+=Maj(a,b,c)
443	ldr	x19,[x30],#8		// *K++, x28 in next round
444	//add	x22,x22,x17			// h+=Sigma0(a)
// round 14: a..h = x22,x23,x24,x25,x26,x27,x20,x21; X[14]=x1
445#ifndef	__AARCH64EB__
446	rev	x1,x1			// 14
447#endif
448	ldr	x6,[sp,#24]			// reload X[3]
449	add	x22,x22,x17			// h+=Sigma0(a)
450	str	x9,[sp,#16]			// spill X[6]; x9 becomes a temporary
451	ror	x16,x26,#14
452	add	x21,x21,x19			// h+=K[i]
453	eor	x9,x26,x26,ror#23
454	and	x17,x27,x26
455	bic	x19,x20,x26
456	add	x21,x21,x1			// h+=X[i]
457	orr	x17,x17,x19			// Ch(e,f,g)
458	eor	x19,x22,x23			// a^b, b^c in next round
459	eor	x16,x16,x9,ror#18	// Sigma1(e)
460	ror	x9,x22,#28
461	add	x21,x21,x17			// h+=Ch(e,f,g)
462	eor	x17,x22,x22,ror#5
463	add	x21,x21,x16			// h+=Sigma1(e)
464	and	x28,x28,x19			// (b^c)&=(a^b)
465	add	x25,x25,x21			// d+=h
466	eor	x28,x28,x23			// Maj(a,b,c)
467	eor	x17,x9,x17,ror#34	// Sigma0(a)
468	add	x21,x21,x28			// h+=Maj(a,b,c)
469	ldr	x28,[x30],#8		// *K++, x19 in next round
470	//add	x21,x21,x17			// h+=Sigma0(a)
// round 15: a..h = x21,x22,x23,x24,x25,x26,x27,x20; X[15]=x2.
// This round is interleaved with the first message-schedule update:
// X[0] (x3) += X[9] (x12) + sigma0(X[1]) + sigma1(X[14]), where
// sigma0 = ror1^ror8^lsr7 (in x9) and sigma1 = ror19^ror61^lsr6 (in x8).
// Sigma0(a) is completed within the round here rather than deferred.
471#ifndef	__AARCH64EB__
472	rev	x2,x2			// 15
473#endif
474	ldr	x7,[sp,#0]			// reload X[4]
475	add	x21,x21,x17			// h+=Sigma0(a)
476	str	x10,[sp,#24]			// spill X[7]; x10 becomes a temporary
477	ror	x16,x25,#14
478	add	x20,x20,x28			// h+=K[i]
479	ror	x9,x4,#1
480	and	x17,x26,x25
481	ror	x8,x1,#19
482	bic	x28,x27,x25
483	ror	x10,x21,#28
484	add	x20,x20,x2			// h+=X[i]
485	eor	x16,x16,x25,ror#18
486	eor	x9,x9,x4,ror#8
487	orr	x17,x17,x28			// Ch(e,f,g)
488	eor	x28,x21,x22			// a^b, b^c in next round
489	eor	x16,x16,x25,ror#41	// Sigma1(e)
490	eor	x10,x10,x21,ror#34
491	add	x20,x20,x17			// h+=Ch(e,f,g)
492	and	x19,x19,x28			// (b^c)&=(a^b)
493	eor	x8,x8,x1,ror#61
494	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
495	add	x20,x20,x16			// h+=Sigma1(e)
496	eor	x19,x19,x22			// Maj(a,b,c)
497	eor	x17,x10,x21,ror#39	// Sigma0(a)
498	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
499	add	x3,x3,x12			// X[0] += X[9]
500	add	x24,x24,x20			// d+=h
501	add	x20,x20,x19			// h+=Maj(a,b,c)
502	ldr	x19,[x30],#8		// *K++, x28 in next round
503	add	x3,x3,x9			// X[0] += sigma0(X[1])
504	add	x20,x20,x17			// h+=Sigma0(a)
505	add	x3,x3,x8			// X[0] += sigma1(X[14])
// Rounds 16 and up, sixteen rounds per iteration.  In the round with
// t%16 == k the compression step consumes the already-expanded X[k] while
// the next word is expanded in place:
//   X[k+1] += X[k+10] + sigma0(X[k+2]) + sigma1(X[k+15])   (indices mod 16)
// with sigma0 = ror1^ror8^lsr7 and sigma1 = ror19^ror61^lsr6.
// X[0..15] live in x3..x15,x0,x1,x2.  The registers of X[k+6], X[k+7] and
// X[k+8] serve as the sigma1 / sigma0 / Sigma0 temporaries of round k, so
// every round first reloads X[k+5] from, and spills X[k+8] to, the four
// slots at [sp,#0..#24].
// The loop repeats until the K value loaded at the end of the sixteenth
// round is zero (cbnz at the bottom).
// t%16 == 0: a..h = x20,x21,x22,x23,x24,x25,x26,x27; uses X[0]=x3, expands X[1]=x4
506.Loop_16_xx:
507	ldr	x8,[sp,#8]
508	str	x11,[sp,#0]
509	ror	x16,x24,#14
510	add	x27,x27,x19			// h+=K[i]
511	ror	x10,x5,#1
512	and	x17,x25,x24
513	ror	x9,x2,#19
514	bic	x19,x26,x24
515	ror	x11,x20,#28
516	add	x27,x27,x3			// h+=X[i]
517	eor	x16,x16,x24,ror#18
518	eor	x10,x10,x5,ror#8
519	orr	x17,x17,x19			// Ch(e,f,g)
520	eor	x19,x20,x21			// a^b, b^c in next round
521	eor	x16,x16,x24,ror#41	// Sigma1(e)
522	eor	x11,x11,x20,ror#34
523	add	x27,x27,x17			// h+=Ch(e,f,g)
524	and	x28,x28,x19			// (b^c)&=(a^b)
525	eor	x9,x9,x2,ror#61
526	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
527	add	x27,x27,x16			// h+=Sigma1(e)
528	eor	x28,x28,x21			// Maj(a,b,c)
529	eor	x17,x11,x20,ror#39	// Sigma0(a)
530	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
531	add	x4,x4,x13
532	add	x23,x23,x27			// d+=h
533	add	x27,x27,x28			// h+=Maj(a,b,c)
534	ldr	x28,[x30],#8		// *K++, x19 in next round
535	add	x4,x4,x10
536	add	x27,x27,x17			// h+=Sigma0(a)
537	add	x4,x4,x9
// t%16 == 1: a..h = x27,x20,x21,x22,x23,x24,x25,x26; uses X[1]=x4, expands X[2]=x5
538	ldr	x9,[sp,#16]
539	str	x12,[sp,#8]
540	ror	x16,x23,#14
541	add	x26,x26,x28			// h+=K[i]
542	ror	x11,x6,#1
543	and	x17,x24,x23
544	ror	x10,x3,#19
545	bic	x28,x25,x23
546	ror	x12,x27,#28
547	add	x26,x26,x4			// h+=X[i]
548	eor	x16,x16,x23,ror#18
549	eor	x11,x11,x6,ror#8
550	orr	x17,x17,x28			// Ch(e,f,g)
551	eor	x28,x27,x20			// a^b, b^c in next round
552	eor	x16,x16,x23,ror#41	// Sigma1(e)
553	eor	x12,x12,x27,ror#34
554	add	x26,x26,x17			// h+=Ch(e,f,g)
555	and	x19,x19,x28			// (b^c)&=(a^b)
556	eor	x10,x10,x3,ror#61
557	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
558	add	x26,x26,x16			// h+=Sigma1(e)
559	eor	x19,x19,x20			// Maj(a,b,c)
560	eor	x17,x12,x27,ror#39	// Sigma0(a)
561	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
562	add	x5,x5,x14
563	add	x22,x22,x26			// d+=h
564	add	x26,x26,x19			// h+=Maj(a,b,c)
565	ldr	x19,[x30],#8		// *K++, x28 in next round
566	add	x5,x5,x11
567	add	x26,x26,x17			// h+=Sigma0(a)
568	add	x5,x5,x10
// t%16 == 2: a..h = x26,x27,x20,x21,x22,x23,x24,x25; uses X[2]=x5, expands X[3]=x6
569	ldr	x10,[sp,#24]
570	str	x13,[sp,#16]
571	ror	x16,x22,#14
572	add	x25,x25,x19			// h+=K[i]
573	ror	x12,x7,#1
574	and	x17,x23,x22
575	ror	x11,x4,#19
576	bic	x19,x24,x22
577	ror	x13,x26,#28
578	add	x25,x25,x5			// h+=X[i]
579	eor	x16,x16,x22,ror#18
580	eor	x12,x12,x7,ror#8
581	orr	x17,x17,x19			// Ch(e,f,g)
582	eor	x19,x26,x27			// a^b, b^c in next round
583	eor	x16,x16,x22,ror#41	// Sigma1(e)
584	eor	x13,x13,x26,ror#34
585	add	x25,x25,x17			// h+=Ch(e,f,g)
586	and	x28,x28,x19			// (b^c)&=(a^b)
587	eor	x11,x11,x4,ror#61
588	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
589	add	x25,x25,x16			// h+=Sigma1(e)
590	eor	x28,x28,x27			// Maj(a,b,c)
591	eor	x17,x13,x26,ror#39	// Sigma0(a)
592	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
593	add	x6,x6,x15
594	add	x21,x21,x25			// d+=h
595	add	x25,x25,x28			// h+=Maj(a,b,c)
596	ldr	x28,[x30],#8		// *K++, x19 in next round
597	add	x6,x6,x12
598	add	x25,x25,x17			// h+=Sigma0(a)
599	add	x6,x6,x11
// t%16 == 3: a..h = x25,x26,x27,x20,x21,x22,x23,x24; uses X[3]=x6, expands X[4]=x7
600	ldr	x11,[sp,#0]
601	str	x14,[sp,#24]
602	ror	x16,x21,#14
603	add	x24,x24,x28			// h+=K[i]
604	ror	x13,x8,#1
605	and	x17,x22,x21
606	ror	x12,x5,#19
607	bic	x28,x23,x21
608	ror	x14,x25,#28
609	add	x24,x24,x6			// h+=X[i]
610	eor	x16,x16,x21,ror#18
611	eor	x13,x13,x8,ror#8
612	orr	x17,x17,x28			// Ch(e,f,g)
613	eor	x28,x25,x26			// a^b, b^c in next round
614	eor	x16,x16,x21,ror#41	// Sigma1(e)
615	eor	x14,x14,x25,ror#34
616	add	x24,x24,x17			// h+=Ch(e,f,g)
617	and	x19,x19,x28			// (b^c)&=(a^b)
618	eor	x12,x12,x5,ror#61
619	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
620	add	x24,x24,x16			// h+=Sigma1(e)
621	eor	x19,x19,x26			// Maj(a,b,c)
622	eor	x17,x14,x25,ror#39	// Sigma0(a)
623	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
624	add	x7,x7,x0
625	add	x20,x20,x24			// d+=h
626	add	x24,x24,x19			// h+=Maj(a,b,c)
627	ldr	x19,[x30],#8		// *K++, x28 in next round
628	add	x7,x7,x13
629	add	x24,x24,x17			// h+=Sigma0(a)
630	add	x7,x7,x12
// t%16 == 4: a..h = x24,x25,x26,x27,x20,x21,x22,x23; uses X[4]=x7, expands X[5]=x8
631	ldr	x12,[sp,#8]
632	str	x15,[sp,#0]
633	ror	x16,x20,#14
634	add	x23,x23,x19			// h+=K[i]
635	ror	x14,x9,#1
636	and	x17,x21,x20
637	ror	x13,x6,#19
638	bic	x19,x22,x20
639	ror	x15,x24,#28
640	add	x23,x23,x7			// h+=X[i]
641	eor	x16,x16,x20,ror#18
642	eor	x14,x14,x9,ror#8
643	orr	x17,x17,x19			// Ch(e,f,g)
644	eor	x19,x24,x25			// a^b, b^c in next round
645	eor	x16,x16,x20,ror#41	// Sigma1(e)
646	eor	x15,x15,x24,ror#34
647	add	x23,x23,x17			// h+=Ch(e,f,g)
648	and	x28,x28,x19			// (b^c)&=(a^b)
649	eor	x13,x13,x6,ror#61
650	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
651	add	x23,x23,x16			// h+=Sigma1(e)
652	eor	x28,x28,x25			// Maj(a,b,c)
653	eor	x17,x15,x24,ror#39	// Sigma0(a)
654	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
655	add	x8,x8,x1
656	add	x27,x27,x23			// d+=h
657	add	x23,x23,x28			// h+=Maj(a,b,c)
658	ldr	x28,[x30],#8		// *K++, x19 in next round
659	add	x8,x8,x14
660	add	x23,x23,x17			// h+=Sigma0(a)
661	add	x8,x8,x13
// t%16 == 5: a..h = x23,x24,x25,x26,x27,x20,x21,x22; uses X[5]=x8, expands X[6]=x9
662	ldr	x13,[sp,#16]
663	str	x0,[sp,#8]
664	ror	x16,x27,#14
665	add	x22,x22,x28			// h+=K[i]
666	ror	x15,x10,#1
667	and	x17,x20,x27
668	ror	x14,x7,#19
669	bic	x28,x21,x27
670	ror	x0,x23,#28
671	add	x22,x22,x8			// h+=X[i]
672	eor	x16,x16,x27,ror#18
673	eor	x15,x15,x10,ror#8
674	orr	x17,x17,x28			// Ch(e,f,g)
675	eor	x28,x23,x24			// a^b, b^c in next round
676	eor	x16,x16,x27,ror#41	// Sigma1(e)
677	eor	x0,x0,x23,ror#34
678	add	x22,x22,x17			// h+=Ch(e,f,g)
679	and	x19,x19,x28			// (b^c)&=(a^b)
680	eor	x14,x14,x7,ror#61
681	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
682	add	x22,x22,x16			// h+=Sigma1(e)
683	eor	x19,x19,x24			// Maj(a,b,c)
684	eor	x17,x0,x23,ror#39	// Sigma0(a)
685	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
686	add	x9,x9,x2
687	add	x26,x26,x22			// d+=h
688	add	x22,x22,x19			// h+=Maj(a,b,c)
689	ldr	x19,[x30],#8		// *K++, x28 in next round
690	add	x9,x9,x15
691	add	x22,x22,x17			// h+=Sigma0(a)
692	add	x9,x9,x14
// t%16 == 6: a..h = x22,x23,x24,x25,x26,x27,x20,x21; uses X[6]=x9, expands X[7]=x10
693	ldr	x14,[sp,#24]
694	str	x1,[sp,#16]
695	ror	x16,x26,#14
696	add	x21,x21,x19			// h+=K[i]
697	ror	x0,x11,#1
698	and	x17,x27,x26
699	ror	x15,x8,#19
700	bic	x19,x20,x26
701	ror	x1,x22,#28
702	add	x21,x21,x9			// h+=X[i]
703	eor	x16,x16,x26,ror#18
704	eor	x0,x0,x11,ror#8
705	orr	x17,x17,x19			// Ch(e,f,g)
706	eor	x19,x22,x23			// a^b, b^c in next round
707	eor	x16,x16,x26,ror#41	// Sigma1(e)
708	eor	x1,x1,x22,ror#34
709	add	x21,x21,x17			// h+=Ch(e,f,g)
710	and	x28,x28,x19			// (b^c)&=(a^b)
711	eor	x15,x15,x8,ror#61
712	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
713	add	x21,x21,x16			// h+=Sigma1(e)
714	eor	x28,x28,x23			// Maj(a,b,c)
715	eor	x17,x1,x22,ror#39	// Sigma0(a)
716	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
717	add	x10,x10,x3
718	add	x25,x25,x21			// d+=h
719	add	x21,x21,x28			// h+=Maj(a,b,c)
720	ldr	x28,[x30],#8		// *K++, x19 in next round
721	add	x10,x10,x0
722	add	x21,x21,x17			// h+=Sigma0(a)
723	add	x10,x10,x15
// t%16 == 7: a..h = x21,x22,x23,x24,x25,x26,x27,x20; uses X[7]=x10, expands X[8]=x11
724	ldr	x15,[sp,#0]
725	str	x2,[sp,#24]
726	ror	x16,x25,#14
727	add	x20,x20,x28			// h+=K[i]
728	ror	x1,x12,#1
729	and	x17,x26,x25
730	ror	x0,x9,#19
731	bic	x28,x27,x25
732	ror	x2,x21,#28
733	add	x20,x20,x10			// h+=X[i]
734	eor	x16,x16,x25,ror#18
735	eor	x1,x1,x12,ror#8
736	orr	x17,x17,x28			// Ch(e,f,g)
737	eor	x28,x21,x22			// a^b, b^c in next round
738	eor	x16,x16,x25,ror#41	// Sigma1(e)
739	eor	x2,x2,x21,ror#34
740	add	x20,x20,x17			// h+=Ch(e,f,g)
741	and	x19,x19,x28			// (b^c)&=(a^b)
742	eor	x0,x0,x9,ror#61
743	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
744	add	x20,x20,x16			// h+=Sigma1(e)
745	eor	x19,x19,x22			// Maj(a,b,c)
746	eor	x17,x2,x21,ror#39	// Sigma0(a)
747	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
748	add	x11,x11,x4
749	add	x24,x24,x20			// d+=h
750	add	x20,x20,x19			// h+=Maj(a,b,c)
751	ldr	x19,[x30],#8		// *K++, x28 in next round
752	add	x11,x11,x1
753	add	x20,x20,x17			// h+=Sigma0(a)
754	add	x11,x11,x0
// t%16 == 8: a..h = x20,x21,x22,x23,x24,x25,x26,x27; uses X[8]=x11, expands X[9]=x12
755	ldr	x0,[sp,#8]
756	str	x3,[sp,#0]
757	ror	x16,x24,#14
758	add	x27,x27,x19			// h+=K[i]
759	ror	x2,x13,#1
760	and	x17,x25,x24
761	ror	x1,x10,#19
762	bic	x19,x26,x24
763	ror	x3,x20,#28
764	add	x27,x27,x11			// h+=X[i]
765	eor	x16,x16,x24,ror#18
766	eor	x2,x2,x13,ror#8
767	orr	x17,x17,x19			// Ch(e,f,g)
768	eor	x19,x20,x21			// a^b, b^c in next round
769	eor	x16,x16,x24,ror#41	// Sigma1(e)
770	eor	x3,x3,x20,ror#34
771	add	x27,x27,x17			// h+=Ch(e,f,g)
772	and	x28,x28,x19			// (b^c)&=(a^b)
773	eor	x1,x1,x10,ror#61
774	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
775	add	x27,x27,x16			// h+=Sigma1(e)
776	eor	x28,x28,x21			// Maj(a,b,c)
777	eor	x17,x3,x20,ror#39	// Sigma0(a)
778	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
779	add	x12,x12,x5
780	add	x23,x23,x27			// d+=h
781	add	x27,x27,x28			// h+=Maj(a,b,c)
782	ldr	x28,[x30],#8		// *K++, x19 in next round
783	add	x12,x12,x2
784	add	x27,x27,x17			// h+=Sigma0(a)
785	add	x12,x12,x1
// t%16 == 9: a..h = x27,x20,x21,x22,x23,x24,x25,x26; uses X[9]=x12, expands X[10]=x13
786	ldr	x1,[sp,#16]
787	str	x4,[sp,#8]
788	ror	x16,x23,#14
789	add	x26,x26,x28			// h+=K[i]
790	ror	x3,x14,#1
791	and	x17,x24,x23
792	ror	x2,x11,#19
793	bic	x28,x25,x23
794	ror	x4,x27,#28
795	add	x26,x26,x12			// h+=X[i]
796	eor	x16,x16,x23,ror#18
797	eor	x3,x3,x14,ror#8
798	orr	x17,x17,x28			// Ch(e,f,g)
799	eor	x28,x27,x20			// a^b, b^c in next round
800	eor	x16,x16,x23,ror#41	// Sigma1(e)
801	eor	x4,x4,x27,ror#34
802	add	x26,x26,x17			// h+=Ch(e,f,g)
803	and	x19,x19,x28			// (b^c)&=(a^b)
804	eor	x2,x2,x11,ror#61
805	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
806	add	x26,x26,x16			// h+=Sigma1(e)
807	eor	x19,x19,x20			// Maj(a,b,c)
808	eor	x17,x4,x27,ror#39	// Sigma0(a)
809	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
810	add	x13,x13,x6
811	add	x22,x22,x26			// d+=h
812	add	x26,x26,x19			// h+=Maj(a,b,c)
813	ldr	x19,[x30],#8		// *K++, x28 in next round
814	add	x13,x13,x3
815	add	x26,x26,x17			// h+=Sigma0(a)
816	add	x13,x13,x2
// t%16 == 10: a..h = x26,x27,x20,x21,x22,x23,x24,x25; uses X[10]=x13, expands X[11]=x14
817	ldr	x2,[sp,#24]
818	str	x5,[sp,#16]
819	ror	x16,x22,#14
820	add	x25,x25,x19			// h+=K[i]
821	ror	x4,x15,#1
822	and	x17,x23,x22
823	ror	x3,x12,#19
824	bic	x19,x24,x22
825	ror	x5,x26,#28
826	add	x25,x25,x13			// h+=X[i]
827	eor	x16,x16,x22,ror#18
828	eor	x4,x4,x15,ror#8
829	orr	x17,x17,x19			// Ch(e,f,g)
830	eor	x19,x26,x27			// a^b, b^c in next round
831	eor	x16,x16,x22,ror#41	// Sigma1(e)
832	eor	x5,x5,x26,ror#34
833	add	x25,x25,x17			// h+=Ch(e,f,g)
834	and	x28,x28,x19			// (b^c)&=(a^b)
835	eor	x3,x3,x12,ror#61
836	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
837	add	x25,x25,x16			// h+=Sigma1(e)
838	eor	x28,x28,x27			// Maj(a,b,c)
839	eor	x17,x5,x26,ror#39	// Sigma0(a)
840	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
841	add	x14,x14,x7
842	add	x21,x21,x25			// d+=h
843	add	x25,x25,x28			// h+=Maj(a,b,c)
844	ldr	x28,[x30],#8		// *K++, x19 in next round
845	add	x14,x14,x4
846	add	x25,x25,x17			// h+=Sigma0(a)
847	add	x14,x14,x3
// t%16 == 11: a..h = x25,x26,x27,x20,x21,x22,x23,x24; uses X[11]=x14, expands X[12]=x15
848	ldr	x3,[sp,#0]
849	str	x6,[sp,#24]
850	ror	x16,x21,#14
851	add	x24,x24,x28			// h+=K[i]
852	ror	x5,x0,#1
853	and	x17,x22,x21
854	ror	x4,x13,#19
855	bic	x28,x23,x21
856	ror	x6,x25,#28
857	add	x24,x24,x14			// h+=X[i]
858	eor	x16,x16,x21,ror#18
859	eor	x5,x5,x0,ror#8
860	orr	x17,x17,x28			// Ch(e,f,g)
861	eor	x28,x25,x26			// a^b, b^c in next round
862	eor	x16,x16,x21,ror#41	// Sigma1(e)
863	eor	x6,x6,x25,ror#34
864	add	x24,x24,x17			// h+=Ch(e,f,g)
865	and	x19,x19,x28			// (b^c)&=(a^b)
866	eor	x4,x4,x13,ror#61
867	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
868	add	x24,x24,x16			// h+=Sigma1(e)
869	eor	x19,x19,x26			// Maj(a,b,c)
870	eor	x17,x6,x25,ror#39	// Sigma0(a)
871	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
872	add	x15,x15,x8
873	add	x20,x20,x24			// d+=h
874	add	x24,x24,x19			// h+=Maj(a,b,c)
875	ldr	x19,[x30],#8		// *K++, x28 in next round
876	add	x15,x15,x5
877	add	x24,x24,x17			// h+=Sigma0(a)
878	add	x15,x15,x4
// t%16 == 12: a..h = x24,x25,x26,x27,x20,x21,x22,x23; uses X[12]=x15, expands X[13]=x0
879	ldr	x4,[sp,#8]
880	str	x7,[sp,#0]
881	ror	x16,x20,#14
882	add	x23,x23,x19			// h+=K[i]
883	ror	x6,x1,#1
884	and	x17,x21,x20
885	ror	x5,x14,#19
886	bic	x19,x22,x20
887	ror	x7,x24,#28
888	add	x23,x23,x15			// h+=X[i]
889	eor	x16,x16,x20,ror#18
890	eor	x6,x6,x1,ror#8
891	orr	x17,x17,x19			// Ch(e,f,g)
892	eor	x19,x24,x25			// a^b, b^c in next round
893	eor	x16,x16,x20,ror#41	// Sigma1(e)
894	eor	x7,x7,x24,ror#34
895	add	x23,x23,x17			// h+=Ch(e,f,g)
896	and	x28,x28,x19			// (b^c)&=(a^b)
897	eor	x5,x5,x14,ror#61
898	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
899	add	x23,x23,x16			// h+=Sigma1(e)
900	eor	x28,x28,x25			// Maj(a,b,c)
901	eor	x17,x7,x24,ror#39	// Sigma0(a)
902	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
903	add	x0,x0,x9
904	add	x27,x27,x23			// d+=h
905	add	x23,x23,x28			// h+=Maj(a,b,c)
906	ldr	x28,[x30],#8		// *K++, x19 in next round
907	add	x0,x0,x6
908	add	x23,x23,x17			// h+=Sigma0(a)
909	add	x0,x0,x5
// t%16 == 13: a..h = x23,x24,x25,x26,x27,x20,x21,x22; uses X[13]=x0, expands X[14]=x1
910	ldr	x5,[sp,#16]
911	str	x8,[sp,#8]
912	ror	x16,x27,#14
913	add	x22,x22,x28			// h+=K[i]
914	ror	x7,x2,#1
915	and	x17,x20,x27
916	ror	x6,x15,#19
917	bic	x28,x21,x27
918	ror	x8,x23,#28
919	add	x22,x22,x0			// h+=X[i]
920	eor	x16,x16,x27,ror#18
921	eor	x7,x7,x2,ror#8
922	orr	x17,x17,x28			// Ch(e,f,g)
923	eor	x28,x23,x24			// a^b, b^c in next round
924	eor	x16,x16,x27,ror#41	// Sigma1(e)
925	eor	x8,x8,x23,ror#34
926	add	x22,x22,x17			// h+=Ch(e,f,g)
927	and	x19,x19,x28			// (b^c)&=(a^b)
928	eor	x6,x6,x15,ror#61
929	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
930	add	x22,x22,x16			// h+=Sigma1(e)
931	eor	x19,x19,x24			// Maj(a,b,c)
932	eor	x17,x8,x23,ror#39	// Sigma0(a)
933	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
934	add	x1,x1,x10
935	add	x26,x26,x22			// d+=h
936	add	x22,x22,x19			// h+=Maj(a,b,c)
937	ldr	x19,[x30],#8		// *K++, x28 in next round
938	add	x1,x1,x7
939	add	x22,x22,x17			// h+=Sigma0(a)
940	add	x1,x1,x6
// t%16 == 14: a..h = x22,x23,x24,x25,x26,x27,x20,x21; uses X[14]=x1, expands X[15]=x2
941	ldr	x6,[sp,#24]
942	str	x9,[sp,#16]
943	ror	x16,x26,#14
944	add	x21,x21,x19			// h+=K[i]
945	ror	x8,x3,#1
946	and	x17,x27,x26
947	ror	x7,x0,#19
948	bic	x19,x20,x26
949	ror	x9,x22,#28
950	add	x21,x21,x1			// h+=X[i]
951	eor	x16,x16,x26,ror#18
952	eor	x8,x8,x3,ror#8
953	orr	x17,x17,x19			// Ch(e,f,g)
954	eor	x19,x22,x23			// a^b, b^c in next round
955	eor	x16,x16,x26,ror#41	// Sigma1(e)
956	eor	x9,x9,x22,ror#34
957	add	x21,x21,x17			// h+=Ch(e,f,g)
958	and	x28,x28,x19			// (b^c)&=(a^b)
959	eor	x7,x7,x0,ror#61
960	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
961	add	x21,x21,x16			// h+=Sigma1(e)
962	eor	x28,x28,x23			// Maj(a,b,c)
963	eor	x17,x9,x22,ror#39	// Sigma0(a)
964	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
965	add	x2,x2,x11
966	add	x25,x25,x21			// d+=h
967	add	x21,x21,x28			// h+=Maj(a,b,c)
968	ldr	x28,[x30],#8		// *K++, x19 in next round
969	add	x2,x2,x8
970	add	x21,x21,x17			// h+=Sigma0(a)
971	add	x2,x2,x7
// t%16 == 15: a..h = x21,x22,x23,x24,x25,x26,x27,x20; uses X[15]=x2, expands X[0]=x3
972	ldr	x7,[sp,#0]
973	str	x10,[sp,#24]
974	ror	x16,x25,#14
975	add	x20,x20,x28			// h+=K[i]
976	ror	x9,x4,#1
977	and	x17,x26,x25
978	ror	x8,x1,#19
979	bic	x28,x27,x25
980	ror	x10,x21,#28
981	add	x20,x20,x2			// h+=X[i]
982	eor	x16,x16,x25,ror#18
983	eor	x9,x9,x4,ror#8
984	orr	x17,x17,x28			// Ch(e,f,g)
985	eor	x28,x21,x22			// a^b, b^c in next round
986	eor	x16,x16,x25,ror#41	// Sigma1(e)
987	eor	x10,x10,x21,ror#34
988	add	x20,x20,x17			// h+=Ch(e,f,g)
989	and	x19,x19,x28			// (b^c)&=(a^b)
990	eor	x8,x8,x1,ror#61
991	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
992	add	x20,x20,x16			// h+=Sigma1(e)
993	eor	x19,x19,x22			// Maj(a,b,c)
994	eor	x17,x10,x21,ror#39	// Sigma0(a)
995	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
996	add	x3,x3,x12
997	add	x24,x24,x20			// d+=h
998	add	x20,x20,x19			// h+=Maj(a,b,c)
999	ldr	x19,[x30],#8		// *K++, x28 in next round
1000	add	x3,x3,x9
1001	add	x20,x20,x17			// h+=Sigma0(a)
1002	add	x3,x3,x8
1003	cbnz	x19,.Loop_16_xx			// loop until the K value just loaded is zero
1004
// End of one block: add the working variables x20-x27 into the stored
// hash state, advance to the next 128-byte block, and repeat .Loop until
// the input pointer reaches the end pointer.  Then restore callee-saved
// registers and return.
1005	ldp	x0,x2,[x29,#96]			// x0 = state pointer, x2 = end of input
1006	ldr	x1,[x29,#112]			// input pointer as saved in .Loop (block start + 16)
1007	sub	x30,x30,#648		// rewind
						// 648 = 81*8: 17 K loads in rounds 0-15 plus 16 per
						// .Loop_16_xx pass; presumably 80 constants + zero terminator
1008
1009	ldp	x3,x4,[x0]			// previous state words (X[] registers are free now)
1010	ldp	x5,x6,[x0,#2*8]
1011	add	x1,x1,#14*8			// advance input pointer
1012	ldp	x7,x8,[x0,#4*8]
1013	add	x20,x20,x3
1014	ldp	x9,x10,[x0,#6*8]
1015	add	x21,x21,x4
1016	add	x22,x22,x5
1017	add	x23,x23,x6
1018	stp	x20,x21,[x0]
1019	add	x24,x24,x7
1020	add	x25,x25,x8
1021	stp	x22,x23,[x0,#2*8]
1022	add	x26,x26,x9
1023	add	x27,x27,x10
1024	cmp	x1,x2				// reached end of input?
1025	stp	x24,x25,[x0,#4*8]
1026	stp	x26,x27,[x0,#6*8]
1027	b.ne	.Loop				// stp does not alter the flags set by cmp
1028
1029	ldp	x19,x20,[x29,#16]		// restore callee-saved x19-x28
1030	add	sp,sp,#4*8			// release the four spill slots
1031	ldp	x21,x22,[x29,#32]
1032	ldp	x23,x24,[x29,#48]
1033	ldp	x25,x26,[x29,#64]
1034	ldp	x27,x28,[x29,#80]
1035	ldp	x29,x30,[sp],#128		// restore FP/LR and pop the 128-byte frame
1036.inst	0xd50323bf				// autiasp
1037	ret
1038.size	sha512_block_data_order,.-sha512_block_data_order
1039
// .LK512: SHA-512 round constant table.  80 64-bit constants (two per
// .quad line, matching K[0..79] of FIPS 180-4) followed by a single zero
// quadword that acts as an end marker.
// The scalar round loop earlier in this file detects the end of the table
// by testing a loaded constant for zero ("cbnz x19,.Loop_16_xx") and then
// rewinds its cursor by 648 = 81*8 bytes.  sha512_block_armv8 consumes
// exactly 80*8 bytes per block and never loads the terminator.
// .align 6 requests 2^6 = 64-byte alignment (power-of-two form on AArch64).
1040.align	6
1041.type	.LK512,%object
1042.LK512:
1043.quad	0x428a2f98d728ae22,0x7137449123ef65cd
1044.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
1045.quad	0x3956c25bf348b538,0x59f111f1b605d019
1046.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
1047.quad	0xd807aa98a3030242,0x12835b0145706fbe
1048.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
1049.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
1050.quad	0x9bdc06a725c71235,0xc19bf174cf692694
1051.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
1052.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
1053.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
1054.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
1055.quad	0x983e5152ee66dfab,0xa831c66d2db43210
1056.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
1057.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
1058.quad	0x06ca6351e003826f,0x142929670a0e6e70
1059.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
1060.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
1061.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
1062.quad	0x81c2c92e47edaee6,0x92722c851482353b
1063.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
1064.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
1065.quad	0xd192e819d6ef5218,0xd69906245565a910
1066.quad	0xf40e35855771202a,0x106aa07032bbd1b8
1067.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
1068.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
1069.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
1070.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
1071.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
1072.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
1073.quad	0x90befffa23631e28,0xa4506cebde82bde9
1074.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
1075.quad	0xca273eceea26619c,0xd186b8c721c0c207
1076.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
1077.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
1078.quad	0x113f9804bef90dae,0x1b710b35131c471b
1079.quad	0x28db77f523047d84,0x32caab7b40c72493
1080.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
1081.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
1082.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
1083.quad	0	// terminator
// Symbol size covers the 80 constants plus the terminator (81*8 bytes).
1084.size	.LK512,.-.LK512
// NUL-terminated ASCII identification string; the bytes spell
//   "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
// No label is attached to it in this part of the file.
1085.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// Re-establish 2^2 = 4-byte alignment after the byte string.  The second
// directive repeats the first and has no additional effect.
1086.align	2
1087.align	2
1088#ifndef	__KERNEL__
// sha512_block_armv8: SHA-512 compression of consecutive 128-byte blocks
// using the SHA-512 instructions (sha512h, sha512h2, sha512su0, sha512su1).
// Those instructions are emitted below as raw .inst words with the mnemonic
// given in the trailing comment (NOTE(review): presumably so the file still
// assembles with toolchains that lack the SHA-512 extension -- confirm).
// The function is assembled only when __KERNEL__ is not defined.
//
// Register roles, as used by the code below:
//   x0       pointer to the 64-byte hash state (loaded on entry, stored on exit)
//   x1       input pointer; 128 bytes are consumed per block
//   x2       number of blocks.  It is decremented before it is tested and the
//            first block is loaded before any test, so a count of 0 is not
//            handled here (NOTE(review): callers presumably pass x2 >= 1)
//   x3       cursor into .LK512, advanced 16 bytes per load, rewound per block
//   x4       x1-128, used for the last-block rewind of x1
//   v0-v4    working state; the five registers rotate roles with a period of
//            five steps
//   v5-v7    operands built with ext for the SHA-512 instructions
//   v16-v23  16-word message schedule (two 64-bit words per register)
//   v24,v25  alternating pairs of .LK512 constants, then constant+message sums
//   v26-v29  copy of the state at block entry, added back after the rounds
// Apart from sp/x29 (restored on exit) and the condition flags, only x1-x4,
// v0-v7 and v16-v29 are written.  x30 is never modified.
1089.type	sha512_block_armv8,%function
1090.align	6
1091sha512_block_armv8:
// Local alias for the entry point.
// NOTE(review): presumably a branch target for code outside this view.
1092.Lv8_entry:
	// 16-byte frame: save x29/x30 and point x29 at it.
1093	stp	x29,x30,[sp,#-16]!
1094	add	x29,sp,#0
1095
1096	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64	// load input
1097	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
1098
1099	ld1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// load context
1100	adr	x3,.LK512
1101
	// Byte-swap each 64-bit message word of the first block.
1102	rev64	v16.16b,v16.16b
1103	rev64	v17.16b,v17.16b
1104	rev64	v18.16b,v18.16b
1105	rev64	v19.16b,v19.16b
1106	rev64	v20.16b,v20.16b
1107	rev64	v21.16b,v21.16b
1108	rev64	v22.16b,v22.16b
1109	rev64	v23.16b,v23.16b
	// Jump over any padding the following .align inserts before the loop head.
1110	b	.Loop_hw
1111
1112.align	4
	// Per-block loop.  One pass performs 40 two-round steps (80 rounds):
	// 32 steps that also update the message schedule (sha512su0/su1),
	// then 8 steps that instead load and byte-swap the next block's input.
1113.Loop_hw:
1114	ld1	{v24.2d},[x3],#16
	// The flags set here are consumed by the csel below; nothing in
	// between modifies them.
1115	subs	x2,x2,#1
1116	sub	x4,x1,#128
1117	orr	v26.16b,v0.16b,v0.16b			// offload
1118	orr	v27.16b,v1.16b,v1.16b
1119	orr	v28.16b,v2.16b,v2.16b
1120	orr	v29.16b,v3.16b,v3.16b
	// On the last block (x2 just reached 0) x1 steps back by 128 bytes, so
	// the "load next input" loads at the end of this pass re-read the
	// current block instead of advancing past it.
1121	csel	x1,x1,x4,ne			// conditional rewind
	// Step anatomy (repeats below with rotated registers): add the message
	// pair to the constant pair, prefetch the next constant pair, swap the
	// two 64-bit halves of the sum (ext #8), add it into one state register,
	// then run sha512h / sha512h2 on that register.
1122	add	v24.2d,v24.2d,v16.2d
1123	ld1	{v25.2d},[x3],#16
1124	ext	v24.16b,v24.16b,v24.16b,#8
1125	ext	v5.16b,v2.16b,v3.16b,#8
1126	ext	v6.16b,v1.16b,v2.16b,#8
1127	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1128.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1129	ext	v7.16b,v20.16b,v21.16b,#8
1130.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1131.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1132	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1133.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1134	add	v25.2d,v25.2d,v17.2d
1135	ld1	{v24.2d},[x3],#16
1136	ext	v25.16b,v25.16b,v25.16b,#8
1137	ext	v5.16b,v4.16b,v2.16b,#8
1138	ext	v6.16b,v0.16b,v4.16b,#8
1139	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1140.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1141	ext	v7.16b,v21.16b,v22.16b,#8
1142.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1143.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1144	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1145.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1146	add	v24.2d,v24.2d,v18.2d
1147	ld1	{v25.2d},[x3],#16
1148	ext	v24.16b,v24.16b,v24.16b,#8
1149	ext	v5.16b,v1.16b,v4.16b,#8
1150	ext	v6.16b,v3.16b,v1.16b,#8
1151	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1152.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1153	ext	v7.16b,v22.16b,v23.16b,#8
1154.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1155.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1156	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1157.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1158	add	v25.2d,v25.2d,v19.2d
1159	ld1	{v24.2d},[x3],#16
1160	ext	v25.16b,v25.16b,v25.16b,#8
1161	ext	v5.16b,v0.16b,v1.16b,#8
1162	ext	v6.16b,v2.16b,v0.16b,#8
1163	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1164.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1165	ext	v7.16b,v23.16b,v16.16b,#8
1166.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1167.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1168	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1169.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1170	add	v24.2d,v24.2d,v20.2d
1171	ld1	{v25.2d},[x3],#16
1172	ext	v24.16b,v24.16b,v24.16b,#8
1173	ext	v5.16b,v3.16b,v0.16b,#8
1174	ext	v6.16b,v4.16b,v3.16b,#8
1175	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1176.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1177	ext	v7.16b,v16.16b,v17.16b,#8
1178.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1179.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1180	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1181.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
	// From here the state-register assignment repeats the first step's
	// (period of five steps); the message registers repeat every eight.
1182	add	v25.2d,v25.2d,v21.2d
1183	ld1	{v24.2d},[x3],#16
1184	ext	v25.16b,v25.16b,v25.16b,#8
1185	ext	v5.16b,v2.16b,v3.16b,#8
1186	ext	v6.16b,v1.16b,v2.16b,#8
1187	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1188.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1189	ext	v7.16b,v17.16b,v18.16b,#8
1190.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1191.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1192	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1193.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1194	add	v24.2d,v24.2d,v22.2d
1195	ld1	{v25.2d},[x3],#16
1196	ext	v24.16b,v24.16b,v24.16b,#8
1197	ext	v5.16b,v4.16b,v2.16b,#8
1198	ext	v6.16b,v0.16b,v4.16b,#8
1199	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1200.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1201	ext	v7.16b,v18.16b,v19.16b,#8
1202.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1203.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1204	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1205.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1206	add	v25.2d,v25.2d,v23.2d
1207	ld1	{v24.2d},[x3],#16
1208	ext	v25.16b,v25.16b,v25.16b,#8
1209	ext	v5.16b,v1.16b,v4.16b,#8
1210	ext	v6.16b,v3.16b,v1.16b,#8
1211	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1212.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1213	ext	v7.16b,v19.16b,v20.16b,#8
1214.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1215.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1216	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1217.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1218	add	v24.2d,v24.2d,v16.2d
1219	ld1	{v25.2d},[x3],#16
1220	ext	v24.16b,v24.16b,v24.16b,#8
1221	ext	v5.16b,v0.16b,v1.16b,#8
1222	ext	v6.16b,v2.16b,v0.16b,#8
1223	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1224.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1225	ext	v7.16b,v20.16b,v21.16b,#8
1226.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1227.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1228	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1229.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1230	add	v25.2d,v25.2d,v17.2d
1231	ld1	{v24.2d},[x3],#16
1232	ext	v25.16b,v25.16b,v25.16b,#8
1233	ext	v5.16b,v3.16b,v0.16b,#8
1234	ext	v6.16b,v4.16b,v3.16b,#8
1235	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1236.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1237	ext	v7.16b,v21.16b,v22.16b,#8
1238.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1239.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1240	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1241.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1242	add	v24.2d,v24.2d,v18.2d
1243	ld1	{v25.2d},[x3],#16
1244	ext	v24.16b,v24.16b,v24.16b,#8
1245	ext	v5.16b,v2.16b,v3.16b,#8
1246	ext	v6.16b,v1.16b,v2.16b,#8
1247	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1248.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1249	ext	v7.16b,v22.16b,v23.16b,#8
1250.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1251.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1252	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1253.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1254	add	v25.2d,v25.2d,v19.2d
1255	ld1	{v24.2d},[x3],#16
1256	ext	v25.16b,v25.16b,v25.16b,#8
1257	ext	v5.16b,v4.16b,v2.16b,#8
1258	ext	v6.16b,v0.16b,v4.16b,#8
1259	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1260.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1261	ext	v7.16b,v23.16b,v16.16b,#8
1262.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1263.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1264	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1265.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1266	add	v24.2d,v24.2d,v20.2d
1267	ld1	{v25.2d},[x3],#16
1268	ext	v24.16b,v24.16b,v24.16b,#8
1269	ext	v5.16b,v1.16b,v4.16b,#8
1270	ext	v6.16b,v3.16b,v1.16b,#8
1271	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1272.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1273	ext	v7.16b,v16.16b,v17.16b,#8
1274.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1275.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1276	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1277.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1278	add	v25.2d,v25.2d,v21.2d
1279	ld1	{v24.2d},[x3],#16
1280	ext	v25.16b,v25.16b,v25.16b,#8
1281	ext	v5.16b,v0.16b,v1.16b,#8
1282	ext	v6.16b,v2.16b,v0.16b,#8
1283	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1284.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1285	ext	v7.16b,v17.16b,v18.16b,#8
1286.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1287.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1288	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1289.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1290	add	v24.2d,v24.2d,v22.2d
1291	ld1	{v25.2d},[x3],#16
1292	ext	v24.16b,v24.16b,v24.16b,#8
1293	ext	v5.16b,v3.16b,v0.16b,#8
1294	ext	v6.16b,v4.16b,v3.16b,#8
1295	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1296.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1297	ext	v7.16b,v18.16b,v19.16b,#8
1298.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1299.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1300	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1301.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1302	add	v25.2d,v25.2d,v23.2d
1303	ld1	{v24.2d},[x3],#16
1304	ext	v25.16b,v25.16b,v25.16b,#8
1305	ext	v5.16b,v2.16b,v3.16b,#8
1306	ext	v6.16b,v1.16b,v2.16b,#8
1307	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1308.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1309	ext	v7.16b,v19.16b,v20.16b,#8
1310.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1311.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1312	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1313.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1314	add	v24.2d,v24.2d,v16.2d
1315	ld1	{v25.2d},[x3],#16
1316	ext	v24.16b,v24.16b,v24.16b,#8
1317	ext	v5.16b,v4.16b,v2.16b,#8
1318	ext	v6.16b,v0.16b,v4.16b,#8
1319	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1320.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1321	ext	v7.16b,v20.16b,v21.16b,#8
1322.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1323.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1324	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1325.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1326	add	v25.2d,v25.2d,v17.2d
1327	ld1	{v24.2d},[x3],#16
1328	ext	v25.16b,v25.16b,v25.16b,#8
1329	ext	v5.16b,v1.16b,v4.16b,#8
1330	ext	v6.16b,v3.16b,v1.16b,#8
1331	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1332.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1333	ext	v7.16b,v21.16b,v22.16b,#8
1334.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1335.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1336	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1337.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1338	add	v24.2d,v24.2d,v18.2d
1339	ld1	{v25.2d},[x3],#16
1340	ext	v24.16b,v24.16b,v24.16b,#8
1341	ext	v5.16b,v0.16b,v1.16b,#8
1342	ext	v6.16b,v2.16b,v0.16b,#8
1343	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1344.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1345	ext	v7.16b,v22.16b,v23.16b,#8
1346.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1347.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1348	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1349.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1350	add	v25.2d,v25.2d,v19.2d
1351	ld1	{v24.2d},[x3],#16
1352	ext	v25.16b,v25.16b,v25.16b,#8
1353	ext	v5.16b,v3.16b,v0.16b,#8
1354	ext	v6.16b,v4.16b,v3.16b,#8
1355	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1356.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1357	ext	v7.16b,v23.16b,v16.16b,#8
1358.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1359.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1360	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1361.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1362	add	v24.2d,v24.2d,v20.2d
1363	ld1	{v25.2d},[x3],#16
1364	ext	v24.16b,v24.16b,v24.16b,#8
1365	ext	v5.16b,v2.16b,v3.16b,#8
1366	ext	v6.16b,v1.16b,v2.16b,#8
1367	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1368.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1369	ext	v7.16b,v16.16b,v17.16b,#8
1370.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1371.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1372	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1373.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1374	add	v25.2d,v25.2d,v21.2d
1375	ld1	{v24.2d},[x3],#16
1376	ext	v25.16b,v25.16b,v25.16b,#8
1377	ext	v5.16b,v4.16b,v2.16b,#8
1378	ext	v6.16b,v0.16b,v4.16b,#8
1379	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1380.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1381	ext	v7.16b,v17.16b,v18.16b,#8
1382.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1383.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1384	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1385.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1386	add	v24.2d,v24.2d,v22.2d
1387	ld1	{v25.2d},[x3],#16
1388	ext	v24.16b,v24.16b,v24.16b,#8
1389	ext	v5.16b,v1.16b,v4.16b,#8
1390	ext	v6.16b,v3.16b,v1.16b,#8
1391	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1392.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1393	ext	v7.16b,v18.16b,v19.16b,#8
1394.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1395.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1396	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1397.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1398	add	v25.2d,v25.2d,v23.2d
1399	ld1	{v24.2d},[x3],#16
1400	ext	v25.16b,v25.16b,v25.16b,#8
1401	ext	v5.16b,v0.16b,v1.16b,#8
1402	ext	v6.16b,v2.16b,v0.16b,#8
1403	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1404.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1405	ext	v7.16b,v19.16b,v20.16b,#8
1406.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1407.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1408	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1409.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1410	add	v24.2d,v24.2d,v16.2d
1411	ld1	{v25.2d},[x3],#16
1412	ext	v24.16b,v24.16b,v24.16b,#8
1413	ext	v5.16b,v3.16b,v0.16b,#8
1414	ext	v6.16b,v4.16b,v3.16b,#8
1415	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1416.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1417	ext	v7.16b,v20.16b,v21.16b,#8
1418.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1419.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1420	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1421.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1422	add	v25.2d,v25.2d,v17.2d
1423	ld1	{v24.2d},[x3],#16
1424	ext	v25.16b,v25.16b,v25.16b,#8
1425	ext	v5.16b,v2.16b,v3.16b,#8
1426	ext	v6.16b,v1.16b,v2.16b,#8
1427	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1428.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1429	ext	v7.16b,v21.16b,v22.16b,#8
1430.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1431.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1432	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1433.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1434	add	v24.2d,v24.2d,v18.2d
1435	ld1	{v25.2d},[x3],#16
1436	ext	v24.16b,v24.16b,v24.16b,#8
1437	ext	v5.16b,v4.16b,v2.16b,#8
1438	ext	v6.16b,v0.16b,v4.16b,#8
1439	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1440.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1441	ext	v7.16b,v22.16b,v23.16b,#8
1442.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1443.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1444	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1445.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1446	add	v25.2d,v25.2d,v19.2d
1447	ld1	{v24.2d},[x3],#16
1448	ext	v25.16b,v25.16b,v25.16b,#8
1449	ext	v5.16b,v1.16b,v4.16b,#8
1450	ext	v6.16b,v3.16b,v1.16b,#8
1451	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1452.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1453	ext	v7.16b,v23.16b,v16.16b,#8
1454.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1455.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1456	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1457.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1458	add	v24.2d,v24.2d,v20.2d
1459	ld1	{v25.2d},[x3],#16
1460	ext	v24.16b,v24.16b,v24.16b,#8
1461	ext	v5.16b,v0.16b,v1.16b,#8
1462	ext	v6.16b,v2.16b,v0.16b,#8
1463	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1464.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1465	ext	v7.16b,v16.16b,v17.16b,#8
1466.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1467.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1468	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1469.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1470	add	v25.2d,v25.2d,v21.2d
1471	ld1	{v24.2d},[x3],#16
1472	ext	v25.16b,v25.16b,v25.16b,#8
1473	ext	v5.16b,v3.16b,v0.16b,#8
1474	ext	v6.16b,v4.16b,v3.16b,#8
1475	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1476.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1477	ext	v7.16b,v17.16b,v18.16b,#8
1478.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1479.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1480	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1481.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1482	add	v24.2d,v24.2d,v22.2d
1483	ld1	{v25.2d},[x3],#16
1484	ext	v24.16b,v24.16b,v24.16b,#8
1485	ext	v5.16b,v2.16b,v3.16b,#8
1486	ext	v6.16b,v1.16b,v2.16b,#8
1487	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1488.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1489	ext	v7.16b,v18.16b,v19.16b,#8
1490.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1491.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1492	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1493.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1494	add	v25.2d,v25.2d,v23.2d
1495	ld1	{v24.2d},[x3],#16
1496	ext	v25.16b,v25.16b,v25.16b,#8
1497	ext	v5.16b,v4.16b,v2.16b,#8
1498	ext	v6.16b,v0.16b,v4.16b,#8
1499	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1500.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1501	ext	v7.16b,v19.16b,v20.16b,#8
1502.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1503.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1504	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1505.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
	// Final 8 steps: no sha512su0/su1.  Once a message register has been
	// added to its constant pair it is refilled from [x1] and byte-swapped,
	// ready for the next pass (or re-read on the last block, see the csel).
1506	ld1	{v25.2d},[x3],#16
1507	add	v24.2d,v24.2d,v16.2d
1508	ld1	{v16.16b},[x1],#16		// load next input
1509	ext	v24.16b,v24.16b,v24.16b,#8
1510	ext	v5.16b,v1.16b,v4.16b,#8
1511	ext	v6.16b,v3.16b,v1.16b,#8
1512	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1513.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1514	rev64	v16.16b,v16.16b
1515	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1516.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1517	ld1	{v24.2d},[x3],#16
1518	add	v25.2d,v25.2d,v17.2d
1519	ld1	{v17.16b},[x1],#16		// load next input
1520	ext	v25.16b,v25.16b,v25.16b,#8
1521	ext	v5.16b,v0.16b,v1.16b,#8
1522	ext	v6.16b,v2.16b,v0.16b,#8
1523	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1524.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1525	rev64	v17.16b,v17.16b
1526	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1527.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1528	ld1	{v25.2d},[x3],#16
1529	add	v24.2d,v24.2d,v18.2d
1530	ld1	{v18.16b},[x1],#16		// load next input
1531	ext	v24.16b,v24.16b,v24.16b,#8
1532	ext	v5.16b,v3.16b,v0.16b,#8
1533	ext	v6.16b,v4.16b,v3.16b,#8
1534	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1535.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1536	rev64	v18.16b,v18.16b
1537	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1538.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1539	ld1	{v24.2d},[x3],#16
1540	add	v25.2d,v25.2d,v19.2d
1541	ld1	{v19.16b},[x1],#16		// load next input
1542	ext	v25.16b,v25.16b,v25.16b,#8
1543	ext	v5.16b,v2.16b,v3.16b,#8
1544	ext	v6.16b,v1.16b,v2.16b,#8
1545	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1546.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1547	rev64	v19.16b,v19.16b
1548	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1549.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1550	ld1	{v25.2d},[x3],#16
1551	add	v24.2d,v24.2d,v20.2d
1552	ld1	{v20.16b},[x1],#16		// load next input
1553	ext	v24.16b,v24.16b,v24.16b,#8
1554	ext	v5.16b,v4.16b,v2.16b,#8
1555	ext	v6.16b,v0.16b,v4.16b,#8
1556	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1557.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1558	rev64	v20.16b,v20.16b
1559	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1560.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1561	ld1	{v24.2d},[x3],#16
1562	add	v25.2d,v25.2d,v21.2d
1563	ld1	{v21.16b},[x1],#16		// load next input
1564	ext	v25.16b,v25.16b,v25.16b,#8
1565	ext	v5.16b,v1.16b,v4.16b,#8
1566	ext	v6.16b,v3.16b,v1.16b,#8
1567	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1568.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1569	rev64	v21.16b,v21.16b
1570	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1571.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1572	ld1	{v25.2d},[x3],#16
1573	add	v24.2d,v24.2d,v22.2d
1574	ld1	{v22.16b},[x1],#16		// load next input
1575	ext	v24.16b,v24.16b,v24.16b,#8
1576	ext	v5.16b,v0.16b,v1.16b,#8
1577	ext	v6.16b,v2.16b,v0.16b,#8
1578	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1579.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1580	rev64	v22.16b,v22.16b
1581	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1582.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
	// 40 loads of 16 bytes have been taken from .LK512 in this pass
	// (640 = 80*8 bytes), so this returns x3 to the start of the table.
1583	sub	x3,x3,#80*8	// rewind
1584	add	v25.2d,v25.2d,v23.2d
1585	ld1	{v23.16b},[x1],#16		// load next input
1586	ext	v25.16b,v25.16b,v25.16b,#8
1587	ext	v5.16b,v3.16b,v0.16b,#8
1588	ext	v6.16b,v4.16b,v3.16b,#8
1589	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1590.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1591	rev64	v23.16b,v23.16b
1592	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1593.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
	// Add the state saved in v26-v29 at the top of the pass.
1594	add	v0.2d,v0.2d,v26.2d			// accumulate
1595	add	v1.2d,v1.2d,v27.2d
1596	add	v2.2d,v2.2d,v28.2d
1597	add	v3.2d,v3.2d,v29.2d
1598
	// x2 was already decremented at the loop head; repeat while blocks remain.
1599	cbnz	x2,.Loop_hw
1600
1601	st1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// store context
1602
	// Pop the 16-byte frame.  Only x29 is reloaded: x30 was never written
	// in this function, so ret uses the value it had on entry.
1603	ldr	x29,[sp],#16
1604	ret
1605.size	sha512_block_armv8,.-sha512_block_armv8
1606#endif
1607