• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
2//
3// Licensed under the OpenSSL license (the "License").  You may not use
4// this file except in compliance with the License.  You can obtain a copy
5// in the file LICENSE in the source distribution or at
6// https://www.openssl.org/source/license.html
7
8// ====================================================================
9// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
10// project. The module is, however, dual licensed under OpenSSL and
11// CRYPTOGAMS licenses depending on where you obtain it. For further
12// details see http://www.openssl.org/~appro/cryptogams/.
13//
14// Permission to use under GPLv2 terms is granted.
15// ====================================================================
16//
17// SHA256/512 for ARMv8.
18//
19// Performance in cycles per processed byte and improvement coefficient
20// over code generated with "default" compiler:
21//
22//		SHA256-hw	SHA256(*)	SHA512
23// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
24// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
25// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
26// Denver	2.01		10.5 (+26%)	6.70 (+8%)
27// X-Gene			20.0 (+100%)	12.8 (+300%(***))
28// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
29//
30// (*)	Software SHA256 results are of lesser relevance, presented
31//	mostly for informational purposes.
32// (**)	The result is a trade-off: it's possible to improve it by
33//	10% (or by 1 cycle per round), but at the cost of 20% loss
34//	on Cortex-A53 (or by 4 cycles per round).
35// (***)	Super-impressive coefficients over gcc-generated code are
36//	indication of some compiler "pathology", most notably code
37//	generated with -mgeneral-regs-only is significantly faster
38//	and the gap is only 40-90%.
39//
40// October 2016.
41//
42// Originally it was reckoned that it makes no sense to implement NEON
43// version of SHA256 for 64-bit processors. This is because performance
44// improvement on most wide-spread Cortex-A5x processors was observed
45// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
46// observed that 32-bit NEON SHA256 performs significantly better than
47// 64-bit scalar version on *some* of the more recent processors. As a
48// result, a 64-bit NEON version of SHA256 was added to provide the best
49// all-round performance. For example it executes ~30% faster on X-Gene
50// and Mongoose. [For reference, NEON version of SHA512 is bound to
51// deliver much less improvement, likely *negative* on Cortex-A5x.
52// Which is why NEON support is limited to SHA256.]
53
54#ifndef	__KERNEL__
55# include "arm_arch.h"
56#endif
57
58.text
59
60.extern	OPENSSL_armcap_P
// ---------------------------------------------------------------------
// sha512_block_data_order -- SHA-512 block transform, scalar AArch64.
//
// Registers on entry, as read by the code below:
//   x0  pointer to the eight 64-bit hash state words (loaded here,
//       written back after every processed block)
//   x1  pointer to the input data
//   x2  number of 128-byte blocks (end of input = x1 + (x2 << 7))
//
// Frame (128 bytes, addressed through x29):
//   [x29,#0]    saved x29/x30
//   [x29,#16]   saved x19..x28 (up to offset #80)
//   [x29,#96]   x0 (state pointer) and the end-of-input pointer
//   [x29,#112]  input pointer, stored at the top of .Loop
// Below the frame, 4*8 bytes at [sp,#0]..[sp,#24] are scratch slots in
// which the round code parks message-schedule words.
//
// x20..x27 hold the working variables a..h.  x30 is repurposed as the
// running pointer into .LK512; the return address is reloaded from the
// frame in the epilogue.
//
// NOTE(review): the end-of-input test (cmp x1,x2 / b.ne .Loop) runs only
// after a block has been processed, so a block count of 0 is not handled
// by this routine -- presumably callers guarantee x2 >= 1; confirm.
// ---------------------------------------------------------------------
61.globl	sha512_block_data_order
62.type	sha512_block_data_order,%function
63.align	6
64sha512_block_data_order:
65	stp	x29,x30,[sp,#-128]!
66	add	x29,sp,#0

// save the callee-saved registers used as working variables/temporaries
68	stp	x19,x20,[sp,#16]
69	stp	x21,x22,[sp,#32]
70	stp	x23,x24,[sp,#48]
71	stp	x25,x26,[sp,#64]
72	stp	x27,x28,[sp,#80]
// reserve four 8-byte scratch slots for spilled schedule words
73	sub	sp,sp,#4*8

75	ldp	x20,x21,[x0]				// load context
76	ldp	x22,x23,[x0,#2*8]
77	ldp	x24,x25,[x0,#4*8]
78	add	x2,x1,x2,lsl#7	// end of input
79	ldp	x26,x27,[x0,#6*8]
80	adr	x30,.LK512
// x0 and x2 are reused for message words later, so keep copies in the frame
81	stp	x0,x2,[x29,#96]
82
// ---------------------------------------------------------------------
// .Loop -- per-block entry: rounds 0..15, fully unrolled.
//
// The sixteen message words are loaded pairwise into
// x3..x15, x0, x1, x2 (X[0]..X[15]) and byte-reversed unless
// __AARCH64EB__ is defined.  The roles of a..h rotate through x20..x27
// from round to round instead of moving data.  x19 and x28 alternate
// between holding K[i] and the a^b value carried into the next round's
// Maj computation.  x16/x17 and one otherwise idle X register are the
// per-round temporaries.
//
// The final "h+=Sigma0(a)" of each round is deferred: it appears
// commented out at the end of the round and is executed near the top of
// the following round.
// ---------------------------------------------------------------------
83.Loop:
// round 0: a=x20, e=x24, h=x27; X[0]=x3
84	ldp	x3,x4,[x1],#2*8
85	ldr	x19,[x30],#8			// *K++
86	eor	x28,x21,x22				// magic seed
// remember the input pointer (already advanced by 16); x1 is overwritten
// with a message word in round 13
87	str	x1,[x29,#112]
88#ifndef	__AARCH64EB__
89	rev	x3,x3			// 0
90#endif
91	ror	x16,x24,#14
92	add	x27,x27,x19			// h+=K[i]
93	eor	x6,x24,x24,ror#23
94	and	x17,x25,x24
95	bic	x19,x26,x24
96	add	x27,x27,x3			// h+=X[i]
97	orr	x17,x17,x19			// Ch(e,f,g)
98	eor	x19,x20,x21			// a^b, b^c in next round
99	eor	x16,x16,x6,ror#18	// Sigma1(e)
100	ror	x6,x20,#28
101	add	x27,x27,x17			// h+=Ch(e,f,g)
102	eor	x17,x20,x20,ror#5
103	add	x27,x27,x16			// h+=Sigma1(e)
104	and	x28,x28,x19			// (b^c)&=(a^b)
105	add	x23,x23,x27			// d+=h
106	eor	x28,x28,x21			// Maj(a,b,c)
107	eor	x17,x6,x17,ror#34	// Sigma0(a)
108	add	x27,x27,x28			// h+=Maj(a,b,c)
109	ldr	x28,[x30],#8		// *K++, x19 in next round
110	//add	x27,x27,x17			// h+=Sigma0(a)
// round 1: a=x27, e=x23, h=x26; X[1]=x4
111#ifndef	__AARCH64EB__
112	rev	x4,x4			// 1
113#endif
114	ldp	x5,x6,[x1],#2*8
115	add	x27,x27,x17			// h+=Sigma0(a)
116	ror	x16,x23,#14
117	add	x26,x26,x28			// h+=K[i]
118	eor	x7,x23,x23,ror#23
119	and	x17,x24,x23
120	bic	x28,x25,x23
121	add	x26,x26,x4			// h+=X[i]
122	orr	x17,x17,x28			// Ch(e,f,g)
123	eor	x28,x27,x20			// a^b, b^c in next round
124	eor	x16,x16,x7,ror#18	// Sigma1(e)
125	ror	x7,x27,#28
126	add	x26,x26,x17			// h+=Ch(e,f,g)
127	eor	x17,x27,x27,ror#5
128	add	x26,x26,x16			// h+=Sigma1(e)
129	and	x19,x19,x28			// (b^c)&=(a^b)
130	add	x22,x22,x26			// d+=h
131	eor	x19,x19,x20			// Maj(a,b,c)
132	eor	x17,x7,x17,ror#34	// Sigma0(a)
133	add	x26,x26,x19			// h+=Maj(a,b,c)
134	ldr	x19,[x30],#8		// *K++, x28 in next round
135	//add	x26,x26,x17			// h+=Sigma0(a)
// round 2: a=x26, e=x22, h=x25; X[2]=x5
136#ifndef	__AARCH64EB__
137	rev	x5,x5			// 2
138#endif
139	add	x26,x26,x17			// h+=Sigma0(a)
140	ror	x16,x22,#14
141	add	x25,x25,x19			// h+=K[i]
142	eor	x8,x22,x22,ror#23
143	and	x17,x23,x22
144	bic	x19,x24,x22
145	add	x25,x25,x5			// h+=X[i]
146	orr	x17,x17,x19			// Ch(e,f,g)
147	eor	x19,x26,x27			// a^b, b^c in next round
148	eor	x16,x16,x8,ror#18	// Sigma1(e)
149	ror	x8,x26,#28
150	add	x25,x25,x17			// h+=Ch(e,f,g)
151	eor	x17,x26,x26,ror#5
152	add	x25,x25,x16			// h+=Sigma1(e)
153	and	x28,x28,x19			// (b^c)&=(a^b)
154	add	x21,x21,x25			// d+=h
155	eor	x28,x28,x27			// Maj(a,b,c)
156	eor	x17,x8,x17,ror#34	// Sigma0(a)
157	add	x25,x25,x28			// h+=Maj(a,b,c)
158	ldr	x28,[x30],#8		// *K++, x19 in next round
159	//add	x25,x25,x17			// h+=Sigma0(a)
// round 3: a=x25, e=x21, h=x24; X[3]=x6
160#ifndef	__AARCH64EB__
161	rev	x6,x6			// 3
162#endif
163	ldp	x7,x8,[x1],#2*8
164	add	x25,x25,x17			// h+=Sigma0(a)
165	ror	x16,x21,#14
166	add	x24,x24,x28			// h+=K[i]
167	eor	x9,x21,x21,ror#23
168	and	x17,x22,x21
169	bic	x28,x23,x21
170	add	x24,x24,x6			// h+=X[i]
171	orr	x17,x17,x28			// Ch(e,f,g)
172	eor	x28,x25,x26			// a^b, b^c in next round
173	eor	x16,x16,x9,ror#18	// Sigma1(e)
174	ror	x9,x25,#28
175	add	x24,x24,x17			// h+=Ch(e,f,g)
176	eor	x17,x25,x25,ror#5
177	add	x24,x24,x16			// h+=Sigma1(e)
178	and	x19,x19,x28			// (b^c)&=(a^b)
179	add	x20,x20,x24			// d+=h
180	eor	x19,x19,x26			// Maj(a,b,c)
181	eor	x17,x9,x17,ror#34	// Sigma0(a)
182	add	x24,x24,x19			// h+=Maj(a,b,c)
183	ldr	x19,[x30],#8		// *K++, x28 in next round
184	//add	x24,x24,x17			// h+=Sigma0(a)
// round 4: a=x24, e=x20, h=x23; X[4]=x7
185#ifndef	__AARCH64EB__
186	rev	x7,x7			// 4
187#endif
188	add	x24,x24,x17			// h+=Sigma0(a)
189	ror	x16,x20,#14
190	add	x23,x23,x19			// h+=K[i]
191	eor	x10,x20,x20,ror#23
192	and	x17,x21,x20
193	bic	x19,x22,x20
194	add	x23,x23,x7			// h+=X[i]
195	orr	x17,x17,x19			// Ch(e,f,g)
196	eor	x19,x24,x25			// a^b, b^c in next round
197	eor	x16,x16,x10,ror#18	// Sigma1(e)
198	ror	x10,x24,#28
199	add	x23,x23,x17			// h+=Ch(e,f,g)
200	eor	x17,x24,x24,ror#5
201	add	x23,x23,x16			// h+=Sigma1(e)
202	and	x28,x28,x19			// (b^c)&=(a^b)
203	add	x27,x27,x23			// d+=h
204	eor	x28,x28,x25			// Maj(a,b,c)
205	eor	x17,x10,x17,ror#34	// Sigma0(a)
206	add	x23,x23,x28			// h+=Maj(a,b,c)
207	ldr	x28,[x30],#8		// *K++, x19 in next round
208	//add	x23,x23,x17			// h+=Sigma0(a)
// round 5: a=x23, e=x27, h=x22; X[5]=x8
209#ifndef	__AARCH64EB__
210	rev	x8,x8			// 5
211#endif
212	ldp	x9,x10,[x1],#2*8
213	add	x23,x23,x17			// h+=Sigma0(a)
214	ror	x16,x27,#14
215	add	x22,x22,x28			// h+=K[i]
216	eor	x11,x27,x27,ror#23
217	and	x17,x20,x27
218	bic	x28,x21,x27
219	add	x22,x22,x8			// h+=X[i]
220	orr	x17,x17,x28			// Ch(e,f,g)
221	eor	x28,x23,x24			// a^b, b^c in next round
222	eor	x16,x16,x11,ror#18	// Sigma1(e)
223	ror	x11,x23,#28
224	add	x22,x22,x17			// h+=Ch(e,f,g)
225	eor	x17,x23,x23,ror#5
226	add	x22,x22,x16			// h+=Sigma1(e)
227	and	x19,x19,x28			// (b^c)&=(a^b)
228	add	x26,x26,x22			// d+=h
229	eor	x19,x19,x24			// Maj(a,b,c)
230	eor	x17,x11,x17,ror#34	// Sigma0(a)
231	add	x22,x22,x19			// h+=Maj(a,b,c)
232	ldr	x19,[x30],#8		// *K++, x28 in next round
233	//add	x22,x22,x17			// h+=Sigma0(a)
// round 6: a=x22, e=x26, h=x21; X[6]=x9
234#ifndef	__AARCH64EB__
235	rev	x9,x9			// 6
236#endif
237	add	x22,x22,x17			// h+=Sigma0(a)
238	ror	x16,x26,#14
239	add	x21,x21,x19			// h+=K[i]
240	eor	x12,x26,x26,ror#23
241	and	x17,x27,x26
242	bic	x19,x20,x26
243	add	x21,x21,x9			// h+=X[i]
244	orr	x17,x17,x19			// Ch(e,f,g)
245	eor	x19,x22,x23			// a^b, b^c in next round
246	eor	x16,x16,x12,ror#18	// Sigma1(e)
247	ror	x12,x22,#28
248	add	x21,x21,x17			// h+=Ch(e,f,g)
249	eor	x17,x22,x22,ror#5
250	add	x21,x21,x16			// h+=Sigma1(e)
251	and	x28,x28,x19			// (b^c)&=(a^b)
252	add	x25,x25,x21			// d+=h
253	eor	x28,x28,x23			// Maj(a,b,c)
254	eor	x17,x12,x17,ror#34	// Sigma0(a)
255	add	x21,x21,x28			// h+=Maj(a,b,c)
256	ldr	x28,[x30],#8		// *K++, x19 in next round
257	//add	x21,x21,x17			// h+=Sigma0(a)
// round 7: a=x21, e=x25, h=x20; X[7]=x10
258#ifndef	__AARCH64EB__
259	rev	x10,x10			// 7
260#endif
261	ldp	x11,x12,[x1],#2*8
262	add	x21,x21,x17			// h+=Sigma0(a)
263	ror	x16,x25,#14
264	add	x20,x20,x28			// h+=K[i]
265	eor	x13,x25,x25,ror#23
266	and	x17,x26,x25
267	bic	x28,x27,x25
268	add	x20,x20,x10			// h+=X[i]
269	orr	x17,x17,x28			// Ch(e,f,g)
270	eor	x28,x21,x22			// a^b, b^c in next round
271	eor	x16,x16,x13,ror#18	// Sigma1(e)
272	ror	x13,x21,#28
273	add	x20,x20,x17			// h+=Ch(e,f,g)
274	eor	x17,x21,x21,ror#5
275	add	x20,x20,x16			// h+=Sigma1(e)
276	and	x19,x19,x28			// (b^c)&=(a^b)
277	add	x24,x24,x20			// d+=h
278	eor	x19,x19,x22			// Maj(a,b,c)
279	eor	x17,x13,x17,ror#34	// Sigma0(a)
280	add	x20,x20,x19			// h+=Maj(a,b,c)
281	ldr	x19,[x30],#8		// *K++, x28 in next round
282	//add	x20,x20,x17			// h+=Sigma0(a)
// round 8: a=x20, e=x24, h=x27; X[8]=x11
283#ifndef	__AARCH64EB__
284	rev	x11,x11			// 8
285#endif
286	add	x20,x20,x17			// h+=Sigma0(a)
287	ror	x16,x24,#14
288	add	x27,x27,x19			// h+=K[i]
289	eor	x14,x24,x24,ror#23
290	and	x17,x25,x24
291	bic	x19,x26,x24
292	add	x27,x27,x11			// h+=X[i]
293	orr	x17,x17,x19			// Ch(e,f,g)
294	eor	x19,x20,x21			// a^b, b^c in next round
295	eor	x16,x16,x14,ror#18	// Sigma1(e)
296	ror	x14,x20,#28
297	add	x27,x27,x17			// h+=Ch(e,f,g)
298	eor	x17,x20,x20,ror#5
299	add	x27,x27,x16			// h+=Sigma1(e)
300	and	x28,x28,x19			// (b^c)&=(a^b)
301	add	x23,x23,x27			// d+=h
302	eor	x28,x28,x21			// Maj(a,b,c)
303	eor	x17,x14,x17,ror#34	// Sigma0(a)
304	add	x27,x27,x28			// h+=Maj(a,b,c)
305	ldr	x28,[x30],#8		// *K++, x19 in next round
306	//add	x27,x27,x17			// h+=Sigma0(a)
// round 9: a=x27, e=x23, h=x26; X[9]=x12
307#ifndef	__AARCH64EB__
308	rev	x12,x12			// 9
309#endif
310	ldp	x13,x14,[x1],#2*8
311	add	x27,x27,x17			// h+=Sigma0(a)
312	ror	x16,x23,#14
313	add	x26,x26,x28			// h+=K[i]
314	eor	x15,x23,x23,ror#23
315	and	x17,x24,x23
316	bic	x28,x25,x23
317	add	x26,x26,x12			// h+=X[i]
318	orr	x17,x17,x28			// Ch(e,f,g)
319	eor	x28,x27,x20			// a^b, b^c in next round
320	eor	x16,x16,x15,ror#18	// Sigma1(e)
321	ror	x15,x27,#28
322	add	x26,x26,x17			// h+=Ch(e,f,g)
323	eor	x17,x27,x27,ror#5
324	add	x26,x26,x16			// h+=Sigma1(e)
325	and	x19,x19,x28			// (b^c)&=(a^b)
326	add	x22,x22,x26			// d+=h
327	eor	x19,x19,x20			// Maj(a,b,c)
328	eor	x17,x15,x17,ror#34	// Sigma0(a)
329	add	x26,x26,x19			// h+=Maj(a,b,c)
330	ldr	x19,[x30],#8		// *K++, x28 in next round
331	//add	x26,x26,x17			// h+=Sigma0(a)
// round 10: a=x26, e=x22, h=x25; X[10]=x13
// (x0 becomes a temporary from here on; the state pointer lives at [x29,#96])
332#ifndef	__AARCH64EB__
333	rev	x13,x13			// 10
334#endif
335	add	x26,x26,x17			// h+=Sigma0(a)
336	ror	x16,x22,#14
337	add	x25,x25,x19			// h+=K[i]
338	eor	x0,x22,x22,ror#23
339	and	x17,x23,x22
340	bic	x19,x24,x22
341	add	x25,x25,x13			// h+=X[i]
342	orr	x17,x17,x19			// Ch(e,f,g)
343	eor	x19,x26,x27			// a^b, b^c in next round
344	eor	x16,x16,x0,ror#18	// Sigma1(e)
345	ror	x0,x26,#28
346	add	x25,x25,x17			// h+=Ch(e,f,g)
347	eor	x17,x26,x26,ror#5
348	add	x25,x25,x16			// h+=Sigma1(e)
349	and	x28,x28,x19			// (b^c)&=(a^b)
350	add	x21,x21,x25			// d+=h
351	eor	x28,x28,x27			// Maj(a,b,c)
352	eor	x17,x0,x17,ror#34	// Sigma0(a)
353	add	x25,x25,x28			// h+=Maj(a,b,c)
354	ldr	x28,[x30],#8		// *K++, x19 in next round
355	//add	x25,x25,x17			// h+=Sigma0(a)
// round 11: a=x25, e=x21, h=x24; X[11]=x14
// From this round on the temporary register still holds a live message
// word, so that word is parked in a stack slot first (here X[3] in x6).
356#ifndef	__AARCH64EB__
357	rev	x14,x14			// 11
358#endif
359	ldp	x15,x0,[x1],#2*8
360	add	x25,x25,x17			// h+=Sigma0(a)
361	str	x6,[sp,#24]
362	ror	x16,x21,#14
363	add	x24,x24,x28			// h+=K[i]
364	eor	x6,x21,x21,ror#23
365	and	x17,x22,x21
366	bic	x28,x23,x21
367	add	x24,x24,x14			// h+=X[i]
368	orr	x17,x17,x28			// Ch(e,f,g)
369	eor	x28,x25,x26			// a^b, b^c in next round
370	eor	x16,x16,x6,ror#18	// Sigma1(e)
371	ror	x6,x25,#28
372	add	x24,x24,x17			// h+=Ch(e,f,g)
373	eor	x17,x25,x25,ror#5
374	add	x24,x24,x16			// h+=Sigma1(e)
375	and	x19,x19,x28			// (b^c)&=(a^b)
376	add	x20,x20,x24			// d+=h
377	eor	x19,x19,x26			// Maj(a,b,c)
378	eor	x17,x6,x17,ror#34	// Sigma0(a)
379	add	x24,x24,x19			// h+=Maj(a,b,c)
380	ldr	x19,[x30],#8		// *K++, x28 in next round
381	//add	x24,x24,x17			// h+=Sigma0(a)
// round 12: a=x24, e=x20, h=x23; X[12]=x15 (X[4] in x7 parked at [sp,#0])
382#ifndef	__AARCH64EB__
383	rev	x15,x15			// 12
384#endif
385	add	x24,x24,x17			// h+=Sigma0(a)
386	str	x7,[sp,#0]
387	ror	x16,x20,#14
388	add	x23,x23,x19			// h+=K[i]
389	eor	x7,x20,x20,ror#23
390	and	x17,x21,x20
391	bic	x19,x22,x20
392	add	x23,x23,x15			// h+=X[i]
393	orr	x17,x17,x19			// Ch(e,f,g)
394	eor	x19,x24,x25			// a^b, b^c in next round
395	eor	x16,x16,x7,ror#18	// Sigma1(e)
396	ror	x7,x24,#28
397	add	x23,x23,x17			// h+=Ch(e,f,g)
398	eor	x17,x24,x24,ror#5
399	add	x23,x23,x16			// h+=Sigma1(e)
400	and	x28,x28,x19			// (b^c)&=(a^b)
401	add	x27,x27,x23			// d+=h
402	eor	x28,x28,x25			// Maj(a,b,c)
403	eor	x17,x7,x17,ror#34	// Sigma0(a)
404	add	x23,x23,x28			// h+=Maj(a,b,c)
405	ldr	x28,[x30],#8		// *K++, x19 in next round
406	//add	x23,x23,x17			// h+=Sigma0(a)
// round 13: a=x23, e=x27, h=x22; X[13]=x0 (X[5] in x8 parked at [sp,#8])
// The last two message words are loaded into x1/x2 themselves; the input
// and end pointers are recovered from the frame after the final round.
407#ifndef	__AARCH64EB__
408	rev	x0,x0			// 13
409#endif
410	ldp	x1,x2,[x1]
411	add	x23,x23,x17			// h+=Sigma0(a)
412	str	x8,[sp,#8]
413	ror	x16,x27,#14
414	add	x22,x22,x28			// h+=K[i]
415	eor	x8,x27,x27,ror#23
416	and	x17,x20,x27
417	bic	x28,x21,x27
418	add	x22,x22,x0			// h+=X[i]
419	orr	x17,x17,x28			// Ch(e,f,g)
420	eor	x28,x23,x24			// a^b, b^c in next round
421	eor	x16,x16,x8,ror#18	// Sigma1(e)
422	ror	x8,x23,#28
423	add	x22,x22,x17			// h+=Ch(e,f,g)
424	eor	x17,x23,x23,ror#5
425	add	x22,x22,x16			// h+=Sigma1(e)
426	and	x19,x19,x28			// (b^c)&=(a^b)
427	add	x26,x26,x22			// d+=h
428	eor	x19,x19,x24			// Maj(a,b,c)
429	eor	x17,x8,x17,ror#34	// Sigma0(a)
430	add	x22,x22,x19			// h+=Maj(a,b,c)
431	ldr	x19,[x30],#8		// *K++, x28 in next round
432	//add	x22,x22,x17			// h+=Sigma0(a)
// round 14: a=x22, e=x26, h=x21; X[14]=x1
// (X[3] reloaded into x6; X[6] in x9 parked at [sp,#16])
433#ifndef	__AARCH64EB__
434	rev	x1,x1			// 14
435#endif
436	ldr	x6,[sp,#24]
437	add	x22,x22,x17			// h+=Sigma0(a)
438	str	x9,[sp,#16]
439	ror	x16,x26,#14
440	add	x21,x21,x19			// h+=K[i]
441	eor	x9,x26,x26,ror#23
442	and	x17,x27,x26
443	bic	x19,x20,x26
444	add	x21,x21,x1			// h+=X[i]
445	orr	x17,x17,x19			// Ch(e,f,g)
446	eor	x19,x22,x23			// a^b, b^c in next round
447	eor	x16,x16,x9,ror#18	// Sigma1(e)
448	ror	x9,x22,#28
449	add	x21,x21,x17			// h+=Ch(e,f,g)
450	eor	x17,x22,x22,ror#5
451	add	x21,x21,x16			// h+=Sigma1(e)
452	and	x28,x28,x19			// (b^c)&=(a^b)
453	add	x25,x25,x21			// d+=h
454	eor	x28,x28,x23			// Maj(a,b,c)
455	eor	x17,x9,x17,ror#34	// Sigma0(a)
456	add	x21,x21,x28			// h+=Maj(a,b,c)
457	ldr	x28,[x30],#8		// *K++, x19 in next round
458	//add	x21,x21,x17			// h+=Sigma0(a)
// round 15: a=x21, e=x25, h=x20; X[15]=x2
// This round already has the shape of the .Loop_16_xx rounds: besides the
// compression step it builds the next schedule word in place,
//   x3 = x3 + x12 + sigma0(x4) + sigma1(x1)
// with x9 = sigma0 and x8 = sigma1 as temporaries, and it no longer
// defers the Sigma0(a) add.
459#ifndef	__AARCH64EB__
460	rev	x2,x2			// 15
461#endif
462	ldr	x7,[sp,#0]
463	add	x21,x21,x17			// h+=Sigma0(a)
464	str	x10,[sp,#24]
465	ror	x16,x25,#14
466	add	x20,x20,x28			// h+=K[i]
467	ror	x9,x4,#1
468	and	x17,x26,x25
469	ror	x8,x1,#19
470	bic	x28,x27,x25
471	ror	x10,x21,#28
472	add	x20,x20,x2			// h+=X[i]
473	eor	x16,x16,x25,ror#18
474	eor	x9,x9,x4,ror#8
475	orr	x17,x17,x28			// Ch(e,f,g)
476	eor	x28,x21,x22			// a^b, b^c in next round
477	eor	x16,x16,x25,ror#41	// Sigma1(e)
478	eor	x10,x10,x21,ror#34
479	add	x20,x20,x17			// h+=Ch(e,f,g)
480	and	x19,x19,x28			// (b^c)&=(a^b)
481	eor	x8,x8,x1,ror#61
482	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
483	add	x20,x20,x16			// h+=Sigma1(e)
484	eor	x19,x19,x22			// Maj(a,b,c)
485	eor	x17,x10,x21,ror#39	// Sigma0(a)
486	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
487	add	x3,x3,x12
488	add	x24,x24,x20			// d+=h
489	add	x20,x20,x19			// h+=Maj(a,b,c)
490	ldr	x19,[x30],#8		// *K++, x28 in next round
491	add	x3,x3,x9
492	add	x20,x20,x17			// h+=Sigma0(a)
493	add	x3,x3,x8
// ---------------------------------------------------------------------
// .Loop_16_xx -- rounds 16 and up, sixteen rounds per pass.
//
// Each round
//   * adds K[i] (preloaded in x19 or x28) and the current schedule word W
//     to h, then Ch, Sigma1, Maj and Sigma0 as in the first sixteen rounds;
//   * builds the schedule word for the following round in place:
//       next += X[i+9] + sigma0(X[i+1]) + sigma1(X[i+14]);
//   * reloads one schedule word from, and parks another one in, the four
//     scratch slots at [sp,#0..#24], because three X registers per round
//     double as temporaries.
//
// The pass repeats until the K word fetched at the end of the sixteenth
// round is zero, i.e. the terminator of .LK512 has been read.
// Per-round notes below give the registers holding e, h, the word added
// to h (W) and the word being prepared (next).
// ---------------------------------------------------------------------
494.Loop_16_xx:
// round +0: e=x24, h=x27, W=x3, next=x4
495	ldr	x8,[sp,#8]
496	str	x11,[sp,#0]
497	ror	x16,x24,#14
498	add	x27,x27,x19			// h+=K[i]
499	ror	x10,x5,#1
500	and	x17,x25,x24
501	ror	x9,x2,#19
502	bic	x19,x26,x24
503	ror	x11,x20,#28
504	add	x27,x27,x3			// h+=X[i]
505	eor	x16,x16,x24,ror#18
506	eor	x10,x10,x5,ror#8
507	orr	x17,x17,x19			// Ch(e,f,g)
508	eor	x19,x20,x21			// a^b, b^c in next round
509	eor	x16,x16,x24,ror#41	// Sigma1(e)
510	eor	x11,x11,x20,ror#34
511	add	x27,x27,x17			// h+=Ch(e,f,g)
512	and	x28,x28,x19			// (b^c)&=(a^b)
513	eor	x9,x9,x2,ror#61
514	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
515	add	x27,x27,x16			// h+=Sigma1(e)
516	eor	x28,x28,x21			// Maj(a,b,c)
517	eor	x17,x11,x20,ror#39	// Sigma0(a)
518	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
519	add	x4,x4,x13
520	add	x23,x23,x27			// d+=h
521	add	x27,x27,x28			// h+=Maj(a,b,c)
522	ldr	x28,[x30],#8		// *K++, x19 in next round
523	add	x4,x4,x10
524	add	x27,x27,x17			// h+=Sigma0(a)
525	add	x4,x4,x9
// round +1: e=x23, h=x26, W=x4, next=x5
526	ldr	x9,[sp,#16]
527	str	x12,[sp,#8]
528	ror	x16,x23,#14
529	add	x26,x26,x28			// h+=K[i]
530	ror	x11,x6,#1
531	and	x17,x24,x23
532	ror	x10,x3,#19
533	bic	x28,x25,x23
534	ror	x12,x27,#28
535	add	x26,x26,x4			// h+=X[i]
536	eor	x16,x16,x23,ror#18
537	eor	x11,x11,x6,ror#8
538	orr	x17,x17,x28			// Ch(e,f,g)
539	eor	x28,x27,x20			// a^b, b^c in next round
540	eor	x16,x16,x23,ror#41	// Sigma1(e)
541	eor	x12,x12,x27,ror#34
542	add	x26,x26,x17			// h+=Ch(e,f,g)
543	and	x19,x19,x28			// (b^c)&=(a^b)
544	eor	x10,x10,x3,ror#61
545	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
546	add	x26,x26,x16			// h+=Sigma1(e)
547	eor	x19,x19,x20			// Maj(a,b,c)
548	eor	x17,x12,x27,ror#39	// Sigma0(a)
549	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
550	add	x5,x5,x14
551	add	x22,x22,x26			// d+=h
552	add	x26,x26,x19			// h+=Maj(a,b,c)
553	ldr	x19,[x30],#8		// *K++, x28 in next round
554	add	x5,x5,x11
555	add	x26,x26,x17			// h+=Sigma0(a)
556	add	x5,x5,x10
// round +2: e=x22, h=x25, W=x5, next=x6
557	ldr	x10,[sp,#24]
558	str	x13,[sp,#16]
559	ror	x16,x22,#14
560	add	x25,x25,x19			// h+=K[i]
561	ror	x12,x7,#1
562	and	x17,x23,x22
563	ror	x11,x4,#19
564	bic	x19,x24,x22
565	ror	x13,x26,#28
566	add	x25,x25,x5			// h+=X[i]
567	eor	x16,x16,x22,ror#18
568	eor	x12,x12,x7,ror#8
569	orr	x17,x17,x19			// Ch(e,f,g)
570	eor	x19,x26,x27			// a^b, b^c in next round
571	eor	x16,x16,x22,ror#41	// Sigma1(e)
572	eor	x13,x13,x26,ror#34
573	add	x25,x25,x17			// h+=Ch(e,f,g)
574	and	x28,x28,x19			// (b^c)&=(a^b)
575	eor	x11,x11,x4,ror#61
576	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
577	add	x25,x25,x16			// h+=Sigma1(e)
578	eor	x28,x28,x27			// Maj(a,b,c)
579	eor	x17,x13,x26,ror#39	// Sigma0(a)
580	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
581	add	x6,x6,x15
582	add	x21,x21,x25			// d+=h
583	add	x25,x25,x28			// h+=Maj(a,b,c)
584	ldr	x28,[x30],#8		// *K++, x19 in next round
585	add	x6,x6,x12
586	add	x25,x25,x17			// h+=Sigma0(a)
587	add	x6,x6,x11
// round +3: e=x21, h=x24, W=x6, next=x7
588	ldr	x11,[sp,#0]
589	str	x14,[sp,#24]
590	ror	x16,x21,#14
591	add	x24,x24,x28			// h+=K[i]
592	ror	x13,x8,#1
593	and	x17,x22,x21
594	ror	x12,x5,#19
595	bic	x28,x23,x21
596	ror	x14,x25,#28
597	add	x24,x24,x6			// h+=X[i]
598	eor	x16,x16,x21,ror#18
599	eor	x13,x13,x8,ror#8
600	orr	x17,x17,x28			// Ch(e,f,g)
601	eor	x28,x25,x26			// a^b, b^c in next round
602	eor	x16,x16,x21,ror#41	// Sigma1(e)
603	eor	x14,x14,x25,ror#34
604	add	x24,x24,x17			// h+=Ch(e,f,g)
605	and	x19,x19,x28			// (b^c)&=(a^b)
606	eor	x12,x12,x5,ror#61
607	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
608	add	x24,x24,x16			// h+=Sigma1(e)
609	eor	x19,x19,x26			// Maj(a,b,c)
610	eor	x17,x14,x25,ror#39	// Sigma0(a)
611	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
612	add	x7,x7,x0
613	add	x20,x20,x24			// d+=h
614	add	x24,x24,x19			// h+=Maj(a,b,c)
615	ldr	x19,[x30],#8		// *K++, x28 in next round
616	add	x7,x7,x13
617	add	x24,x24,x17			// h+=Sigma0(a)
618	add	x7,x7,x12
// round +4: e=x20, h=x23, W=x7, next=x8
619	ldr	x12,[sp,#8]
620	str	x15,[sp,#0]
621	ror	x16,x20,#14
622	add	x23,x23,x19			// h+=K[i]
623	ror	x14,x9,#1
624	and	x17,x21,x20
625	ror	x13,x6,#19
626	bic	x19,x22,x20
627	ror	x15,x24,#28
628	add	x23,x23,x7			// h+=X[i]
629	eor	x16,x16,x20,ror#18
630	eor	x14,x14,x9,ror#8
631	orr	x17,x17,x19			// Ch(e,f,g)
632	eor	x19,x24,x25			// a^b, b^c in next round
633	eor	x16,x16,x20,ror#41	// Sigma1(e)
634	eor	x15,x15,x24,ror#34
635	add	x23,x23,x17			// h+=Ch(e,f,g)
636	and	x28,x28,x19			// (b^c)&=(a^b)
637	eor	x13,x13,x6,ror#61
638	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
639	add	x23,x23,x16			// h+=Sigma1(e)
640	eor	x28,x28,x25			// Maj(a,b,c)
641	eor	x17,x15,x24,ror#39	// Sigma0(a)
642	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
643	add	x8,x8,x1
644	add	x27,x27,x23			// d+=h
645	add	x23,x23,x28			// h+=Maj(a,b,c)
646	ldr	x28,[x30],#8		// *K++, x19 in next round
647	add	x8,x8,x14
648	add	x23,x23,x17			// h+=Sigma0(a)
649	add	x8,x8,x13
// round +5: e=x27, h=x22, W=x8, next=x9
650	ldr	x13,[sp,#16]
651	str	x0,[sp,#8]
652	ror	x16,x27,#14
653	add	x22,x22,x28			// h+=K[i]
654	ror	x15,x10,#1
655	and	x17,x20,x27
656	ror	x14,x7,#19
657	bic	x28,x21,x27
658	ror	x0,x23,#28
659	add	x22,x22,x8			// h+=X[i]
660	eor	x16,x16,x27,ror#18
661	eor	x15,x15,x10,ror#8
662	orr	x17,x17,x28			// Ch(e,f,g)
663	eor	x28,x23,x24			// a^b, b^c in next round
664	eor	x16,x16,x27,ror#41	// Sigma1(e)
665	eor	x0,x0,x23,ror#34
666	add	x22,x22,x17			// h+=Ch(e,f,g)
667	and	x19,x19,x28			// (b^c)&=(a^b)
668	eor	x14,x14,x7,ror#61
669	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
670	add	x22,x22,x16			// h+=Sigma1(e)
671	eor	x19,x19,x24			// Maj(a,b,c)
672	eor	x17,x0,x23,ror#39	// Sigma0(a)
673	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
674	add	x9,x9,x2
675	add	x26,x26,x22			// d+=h
676	add	x22,x22,x19			// h+=Maj(a,b,c)
677	ldr	x19,[x30],#8		// *K++, x28 in next round
678	add	x9,x9,x15
679	add	x22,x22,x17			// h+=Sigma0(a)
680	add	x9,x9,x14
// round +6: e=x26, h=x21, W=x9, next=x10
681	ldr	x14,[sp,#24]
682	str	x1,[sp,#16]
683	ror	x16,x26,#14
684	add	x21,x21,x19			// h+=K[i]
685	ror	x0,x11,#1
686	and	x17,x27,x26
687	ror	x15,x8,#19
688	bic	x19,x20,x26
689	ror	x1,x22,#28
690	add	x21,x21,x9			// h+=X[i]
691	eor	x16,x16,x26,ror#18
692	eor	x0,x0,x11,ror#8
693	orr	x17,x17,x19			// Ch(e,f,g)
694	eor	x19,x22,x23			// a^b, b^c in next round
695	eor	x16,x16,x26,ror#41	// Sigma1(e)
696	eor	x1,x1,x22,ror#34
697	add	x21,x21,x17			// h+=Ch(e,f,g)
698	and	x28,x28,x19			// (b^c)&=(a^b)
699	eor	x15,x15,x8,ror#61
700	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
701	add	x21,x21,x16			// h+=Sigma1(e)
702	eor	x28,x28,x23			// Maj(a,b,c)
703	eor	x17,x1,x22,ror#39	// Sigma0(a)
704	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
705	add	x10,x10,x3
706	add	x25,x25,x21			// d+=h
707	add	x21,x21,x28			// h+=Maj(a,b,c)
708	ldr	x28,[x30],#8		// *K++, x19 in next round
709	add	x10,x10,x0
710	add	x21,x21,x17			// h+=Sigma0(a)
711	add	x10,x10,x15
// round +7: e=x25, h=x20, W=x10, next=x11
712	ldr	x15,[sp,#0]
713	str	x2,[sp,#24]
714	ror	x16,x25,#14
715	add	x20,x20,x28			// h+=K[i]
716	ror	x1,x12,#1
717	and	x17,x26,x25
718	ror	x0,x9,#19
719	bic	x28,x27,x25
720	ror	x2,x21,#28
721	add	x20,x20,x10			// h+=X[i]
722	eor	x16,x16,x25,ror#18
723	eor	x1,x1,x12,ror#8
724	orr	x17,x17,x28			// Ch(e,f,g)
725	eor	x28,x21,x22			// a^b, b^c in next round
726	eor	x16,x16,x25,ror#41	// Sigma1(e)
727	eor	x2,x2,x21,ror#34
728	add	x20,x20,x17			// h+=Ch(e,f,g)
729	and	x19,x19,x28			// (b^c)&=(a^b)
730	eor	x0,x0,x9,ror#61
731	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
732	add	x20,x20,x16			// h+=Sigma1(e)
733	eor	x19,x19,x22			// Maj(a,b,c)
734	eor	x17,x2,x21,ror#39	// Sigma0(a)
735	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
736	add	x11,x11,x4
737	add	x24,x24,x20			// d+=h
738	add	x20,x20,x19			// h+=Maj(a,b,c)
739	ldr	x19,[x30],#8		// *K++, x28 in next round
740	add	x11,x11,x1
741	add	x20,x20,x17			// h+=Sigma0(a)
742	add	x11,x11,x0
// round +8: e=x24, h=x27, W=x11, next=x12
743	ldr	x0,[sp,#8]
744	str	x3,[sp,#0]
745	ror	x16,x24,#14
746	add	x27,x27,x19			// h+=K[i]
747	ror	x2,x13,#1
748	and	x17,x25,x24
749	ror	x1,x10,#19
750	bic	x19,x26,x24
751	ror	x3,x20,#28
752	add	x27,x27,x11			// h+=X[i]
753	eor	x16,x16,x24,ror#18
754	eor	x2,x2,x13,ror#8
755	orr	x17,x17,x19			// Ch(e,f,g)
756	eor	x19,x20,x21			// a^b, b^c in next round
757	eor	x16,x16,x24,ror#41	// Sigma1(e)
758	eor	x3,x3,x20,ror#34
759	add	x27,x27,x17			// h+=Ch(e,f,g)
760	and	x28,x28,x19			// (b^c)&=(a^b)
761	eor	x1,x1,x10,ror#61
762	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
763	add	x27,x27,x16			// h+=Sigma1(e)
764	eor	x28,x28,x21			// Maj(a,b,c)
765	eor	x17,x3,x20,ror#39	// Sigma0(a)
766	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
767	add	x12,x12,x5
768	add	x23,x23,x27			// d+=h
769	add	x27,x27,x28			// h+=Maj(a,b,c)
770	ldr	x28,[x30],#8		// *K++, x19 in next round
771	add	x12,x12,x2
772	add	x27,x27,x17			// h+=Sigma0(a)
773	add	x12,x12,x1
// round +9: e=x23, h=x26, W=x12, next=x13
774	ldr	x1,[sp,#16]
775	str	x4,[sp,#8]
776	ror	x16,x23,#14
777	add	x26,x26,x28			// h+=K[i]
778	ror	x3,x14,#1
779	and	x17,x24,x23
780	ror	x2,x11,#19
781	bic	x28,x25,x23
782	ror	x4,x27,#28
783	add	x26,x26,x12			// h+=X[i]
784	eor	x16,x16,x23,ror#18
785	eor	x3,x3,x14,ror#8
786	orr	x17,x17,x28			// Ch(e,f,g)
787	eor	x28,x27,x20			// a^b, b^c in next round
788	eor	x16,x16,x23,ror#41	// Sigma1(e)
789	eor	x4,x4,x27,ror#34
790	add	x26,x26,x17			// h+=Ch(e,f,g)
791	and	x19,x19,x28			// (b^c)&=(a^b)
792	eor	x2,x2,x11,ror#61
793	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
794	add	x26,x26,x16			// h+=Sigma1(e)
795	eor	x19,x19,x20			// Maj(a,b,c)
796	eor	x17,x4,x27,ror#39	// Sigma0(a)
797	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
798	add	x13,x13,x6
799	add	x22,x22,x26			// d+=h
800	add	x26,x26,x19			// h+=Maj(a,b,c)
801	ldr	x19,[x30],#8		// *K++, x28 in next round
802	add	x13,x13,x3
803	add	x26,x26,x17			// h+=Sigma0(a)
804	add	x13,x13,x2
// round +10: e=x22, h=x25, W=x13, next=x14
805	ldr	x2,[sp,#24]
806	str	x5,[sp,#16]
807	ror	x16,x22,#14
808	add	x25,x25,x19			// h+=K[i]
809	ror	x4,x15,#1
810	and	x17,x23,x22
811	ror	x3,x12,#19
812	bic	x19,x24,x22
813	ror	x5,x26,#28
814	add	x25,x25,x13			// h+=X[i]
815	eor	x16,x16,x22,ror#18
816	eor	x4,x4,x15,ror#8
817	orr	x17,x17,x19			// Ch(e,f,g)
818	eor	x19,x26,x27			// a^b, b^c in next round
819	eor	x16,x16,x22,ror#41	// Sigma1(e)
820	eor	x5,x5,x26,ror#34
821	add	x25,x25,x17			// h+=Ch(e,f,g)
822	and	x28,x28,x19			// (b^c)&=(a^b)
823	eor	x3,x3,x12,ror#61
824	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
825	add	x25,x25,x16			// h+=Sigma1(e)
826	eor	x28,x28,x27			// Maj(a,b,c)
827	eor	x17,x5,x26,ror#39	// Sigma0(a)
828	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
829	add	x14,x14,x7
830	add	x21,x21,x25			// d+=h
831	add	x25,x25,x28			// h+=Maj(a,b,c)
832	ldr	x28,[x30],#8		// *K++, x19 in next round
833	add	x14,x14,x4
834	add	x25,x25,x17			// h+=Sigma0(a)
835	add	x14,x14,x3
// round +11: e=x21, h=x24, W=x14, next=x15
836	ldr	x3,[sp,#0]
837	str	x6,[sp,#24]
838	ror	x16,x21,#14
839	add	x24,x24,x28			// h+=K[i]
840	ror	x5,x0,#1
841	and	x17,x22,x21
842	ror	x4,x13,#19
843	bic	x28,x23,x21
844	ror	x6,x25,#28
845	add	x24,x24,x14			// h+=X[i]
846	eor	x16,x16,x21,ror#18
847	eor	x5,x5,x0,ror#8
848	orr	x17,x17,x28			// Ch(e,f,g)
849	eor	x28,x25,x26			// a^b, b^c in next round
850	eor	x16,x16,x21,ror#41	// Sigma1(e)
851	eor	x6,x6,x25,ror#34
852	add	x24,x24,x17			// h+=Ch(e,f,g)
853	and	x19,x19,x28			// (b^c)&=(a^b)
854	eor	x4,x4,x13,ror#61
855	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
856	add	x24,x24,x16			// h+=Sigma1(e)
857	eor	x19,x19,x26			// Maj(a,b,c)
858	eor	x17,x6,x25,ror#39	// Sigma0(a)
859	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
860	add	x15,x15,x8
861	add	x20,x20,x24			// d+=h
862	add	x24,x24,x19			// h+=Maj(a,b,c)
863	ldr	x19,[x30],#8		// *K++, x28 in next round
864	add	x15,x15,x5
865	add	x24,x24,x17			// h+=Sigma0(a)
866	add	x15,x15,x4
// round +12: e=x20, h=x23, W=x15, next=x0
867	ldr	x4,[sp,#8]
868	str	x7,[sp,#0]
869	ror	x16,x20,#14
870	add	x23,x23,x19			// h+=K[i]
871	ror	x6,x1,#1
872	and	x17,x21,x20
873	ror	x5,x14,#19
874	bic	x19,x22,x20
875	ror	x7,x24,#28
876	add	x23,x23,x15			// h+=X[i]
877	eor	x16,x16,x20,ror#18
878	eor	x6,x6,x1,ror#8
879	orr	x17,x17,x19			// Ch(e,f,g)
880	eor	x19,x24,x25			// a^b, b^c in next round
881	eor	x16,x16,x20,ror#41	// Sigma1(e)
882	eor	x7,x7,x24,ror#34
883	add	x23,x23,x17			// h+=Ch(e,f,g)
884	and	x28,x28,x19			// (b^c)&=(a^b)
885	eor	x5,x5,x14,ror#61
886	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
887	add	x23,x23,x16			// h+=Sigma1(e)
888	eor	x28,x28,x25			// Maj(a,b,c)
889	eor	x17,x7,x24,ror#39	// Sigma0(a)
890	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
891	add	x0,x0,x9
892	add	x27,x27,x23			// d+=h
893	add	x23,x23,x28			// h+=Maj(a,b,c)
894	ldr	x28,[x30],#8		// *K++, x19 in next round
895	add	x0,x0,x6
896	add	x23,x23,x17			// h+=Sigma0(a)
897	add	x0,x0,x5
// round +13: e=x27, h=x22, W=x0, next=x1
898	ldr	x5,[sp,#16]
899	str	x8,[sp,#8]
900	ror	x16,x27,#14
901	add	x22,x22,x28			// h+=K[i]
902	ror	x7,x2,#1
903	and	x17,x20,x27
904	ror	x6,x15,#19
905	bic	x28,x21,x27
906	ror	x8,x23,#28
907	add	x22,x22,x0			// h+=X[i]
908	eor	x16,x16,x27,ror#18
909	eor	x7,x7,x2,ror#8
910	orr	x17,x17,x28			// Ch(e,f,g)
911	eor	x28,x23,x24			// a^b, b^c in next round
912	eor	x16,x16,x27,ror#41	// Sigma1(e)
913	eor	x8,x8,x23,ror#34
914	add	x22,x22,x17			// h+=Ch(e,f,g)
915	and	x19,x19,x28			// (b^c)&=(a^b)
916	eor	x6,x6,x15,ror#61
917	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
918	add	x22,x22,x16			// h+=Sigma1(e)
919	eor	x19,x19,x24			// Maj(a,b,c)
920	eor	x17,x8,x23,ror#39	// Sigma0(a)
921	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
922	add	x1,x1,x10
923	add	x26,x26,x22			// d+=h
924	add	x22,x22,x19			// h+=Maj(a,b,c)
925	ldr	x19,[x30],#8		// *K++, x28 in next round
926	add	x1,x1,x7
927	add	x22,x22,x17			// h+=Sigma0(a)
928	add	x1,x1,x6
// round +14: e=x26, h=x21, W=x1, next=x2
929	ldr	x6,[sp,#24]
930	str	x9,[sp,#16]
931	ror	x16,x26,#14
932	add	x21,x21,x19			// h+=K[i]
933	ror	x8,x3,#1
934	and	x17,x27,x26
935	ror	x7,x0,#19
936	bic	x19,x20,x26
937	ror	x9,x22,#28
938	add	x21,x21,x1			// h+=X[i]
939	eor	x16,x16,x26,ror#18
940	eor	x8,x8,x3,ror#8
941	orr	x17,x17,x19			// Ch(e,f,g)
942	eor	x19,x22,x23			// a^b, b^c in next round
943	eor	x16,x16,x26,ror#41	// Sigma1(e)
944	eor	x9,x9,x22,ror#34
945	add	x21,x21,x17			// h+=Ch(e,f,g)
946	and	x28,x28,x19			// (b^c)&=(a^b)
947	eor	x7,x7,x0,ror#61
948	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
949	add	x21,x21,x16			// h+=Sigma1(e)
950	eor	x28,x28,x23			// Maj(a,b,c)
951	eor	x17,x9,x22,ror#39	// Sigma0(a)
952	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
953	add	x2,x2,x11
954	add	x25,x25,x21			// d+=h
955	add	x21,x21,x28			// h+=Maj(a,b,c)
956	ldr	x28,[x30],#8		// *K++, x19 in next round
957	add	x2,x2,x8
958	add	x21,x21,x17			// h+=Sigma0(a)
959	add	x2,x2,x7
// round +15: e=x25, h=x20, W=x2, next=x3
960	ldr	x7,[sp,#0]
961	str	x10,[sp,#24]
962	ror	x16,x25,#14
963	add	x20,x20,x28			// h+=K[i]
964	ror	x9,x4,#1
965	and	x17,x26,x25
966	ror	x8,x1,#19
967	bic	x28,x27,x25
968	ror	x10,x21,#28
969	add	x20,x20,x2			// h+=X[i]
970	eor	x16,x16,x25,ror#18
971	eor	x9,x9,x4,ror#8
972	orr	x17,x17,x28			// Ch(e,f,g)
973	eor	x28,x21,x22			// a^b, b^c in next round
974	eor	x16,x16,x25,ror#41	// Sigma1(e)
975	eor	x10,x10,x21,ror#34
976	add	x20,x20,x17			// h+=Ch(e,f,g)
977	and	x19,x19,x28			// (b^c)&=(a^b)
978	eor	x8,x8,x1,ror#61
979	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
980	add	x20,x20,x16			// h+=Sigma1(e)
981	eor	x19,x19,x22			// Maj(a,b,c)
982	eor	x17,x10,x21,ror#39	// Sigma0(a)
983	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
984	add	x3,x3,x12
985	add	x24,x24,x20			// d+=h
986	add	x20,x20,x19			// h+=Maj(a,b,c)
987	ldr	x19,[x30],#8		// *K++, x28 in next round
988	add	x3,x3,x9
989	add	x20,x20,x17			// h+=Sigma0(a)
990	add	x3,x3,x8
// x19 now holds the next K word; zero means the table terminator was read
991	cbnz	x19,.Loop_16_xx
992
// ---------------------------------------------------------------------
// End of one block: recover the pointers that were overwritten by message
// words, fold the working variables into the stored state, and either go
// back to .Loop for the next block or return.
// ---------------------------------------------------------------------
// x0 = state pointer, x2 = end of input, x1 = input pointer as saved at the
// top of .Loop (block start + 2*8)
993	ldp	x0,x2,[x29,#96]
994	ldr	x1,[x29,#112]
// the K pointer was post-incremented once per table entry including the
// terminator: 81 * 8 = 648 bytes
995	sub	x30,x30,#648		// rewind

997	ldp	x3,x4,[x0]
998	ldp	x5,x6,[x0,#2*8]
// saved pointer was block start + 16, so +14*8 lands on the next block
999	add	x1,x1,#14*8			// advance input pointer
1000	ldp	x7,x8,[x0,#4*8]
1001	add	x20,x20,x3
1002	ldp	x9,x10,[x0,#6*8]
1003	add	x21,x21,x4
1004	add	x22,x22,x5
1005	add	x23,x23,x6
1006	stp	x20,x21,[x0]
1007	add	x24,x24,x7
1008	add	x25,x25,x8
1009	stp	x22,x23,[x0,#2*8]
1010	add	x26,x26,x9
1011	add	x27,x27,x10
// flags from this compare are consumed by b.ne below; the stp
// instructions in between do not modify them
1012	cmp	x1,x2
1013	stp	x24,x25,[x0,#4*8]
1014	stp	x26,x27,[x0,#6*8]
1015	b.ne	.Loop

// epilogue: restore callee-saved registers, drop the 4*8-byte scratch
// area, then pop the 128-byte frame together with x29/x30
1017	ldp	x19,x20,[x29,#16]
1018	add	sp,sp,#4*8
1019	ldp	x21,x22,[x29,#32]
1020	ldp	x23,x24,[x29,#48]
1021	ldp	x25,x26,[x29,#64]
1022	ldp	x27,x28,[x29,#80]
1023	ldp	x29,x30,[sp],#128
1024	ret
1025.size	sha512_block_data_order,.-sha512_block_data_order
1026
// ---------------------------------------------------------------------
// .LK512 -- table of the 80 64-bit round constants, two per line,
// followed by a zero word.  The round code walks it with post-indexed
// loads through x30; .Loop_16_xx exits when it fetches the zero word, and
// the pointer is then rewound by 648 bytes (81 entries * 8).
// ---------------------------------------------------------------------
1027.align	6
1028.type	.LK512,%object
1029.LK512:
1030	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
1031	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
1032	.quad	0x3956c25bf348b538,0x59f111f1b605d019
1033	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
1034	.quad	0xd807aa98a3030242,0x12835b0145706fbe
1035	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
1036	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
1037	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
1038	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
1039	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
1040	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
1041	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
1042	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
1043	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
1044	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
1045	.quad	0x06ca6351e003826f,0x142929670a0e6e70
1046	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
1047	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
1048	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
1049	.quad	0x81c2c92e47edaee6,0x92722c851482353b
1050	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
1051	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
1052	.quad	0xd192e819d6ef5218,0xd69906245565a910
1053	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
1054	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
1055	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
1056	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
1057	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
1058	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
1059	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
1060	.quad	0x90befffa23631e28,0xa4506cebde82bde9
1061	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
1062	.quad	0xca273eceea26619c,0xd186b8c721c0c207
1063	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
1064	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
1065	.quad	0x113f9804bef90dae,0x1b710b35131c471b
1066	.quad	0x28db77f523047d84,0x32caab7b40c72493
1067	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
1068	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
1069	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
1070	.quad	0	// terminator
1071.size	.LK512,.-.LK512
// ---------------------------------------------------------------------
// Trailer data.
// .LOPENSSL_armcap_P holds the distance from this location to
// OPENSSL_armcap_P (a 32-bit value under __ILP32__, 64-bit otherwise).
// It is emitted only outside kernel builds and is not referenced by the
// code visible in this file.
// ---------------------------------------------------------------------
1072#ifndef	__KERNEL__
1073.align	3
1074.LOPENSSL_armcap_P:
1075# ifdef	__ILP32__
1076	.long	OPENSSL_armcap_P-.
1077# else
1078	.quad	OPENSSL_armcap_P-.
1079# endif
1080#endif
// identification string embedded in the object
1081.asciz	"SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
1082.align	2
// outside kernel builds, declare OPENSSL_armcap_P as a common symbol
// (size 4, alignment argument 4)
1083#ifndef	__KERNEL__
1084.comm	OPENSSL_armcap_P,4,4
1085#endif
1086