• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#include "arm_arch.h"
2
@ Code and the constant table that follow are emitted into the text section.
.text

@ Pick the instruction encoding for the rest of the file: unified-syntax
@ Thumb when the compiler targets Thumb-2, classic 32-bit ARM otherwise.
#ifdef	__thumb2__
.syntax	unified
.thumb
#else
.arm
#endif
11
@ iotas32 - the 24 Keccak-f[1600] round constants (iota step).
@
@ Each 64-bit constant occupies 8 bytes as two 32-bit words in bit-interleaved
@ form: the first word holds the even-numbered bits of the constant, the
@ second word holds the odd-numbered bits. The canonical 64-bit value is
@ given next to every entry for reference.
@
@ KeccakF1600_int reaches this table with adr (PC-relative), which is
@ presumably why it is placed in .text next to the code. One pass through
@ .Lround2x reads the entry at the counter offset and the entry 8 bytes
@ after it, then advances the counter by 16 and compares it with
@ 192 (= 24 entries * 8 bytes).
.type	iotas32, %object
.p2align	5			@ 32-byte boundary
iotas32:
	.word	0x00000001, 0x00000000	@ round  0: 0x0000000000000001
	.word	0x00000000, 0x00000089	@ round  1: 0x0000000000008082
	.word	0x00000000, 0x8000008b	@ round  2: 0x800000000000808a
	.word	0x00000000, 0x80008080	@ round  3: 0x8000000080008000
	.word	0x00000001, 0x0000008b	@ round  4: 0x000000000000808b
	.word	0x00000001, 0x00008000	@ round  5: 0x0000000080000001
	.word	0x00000001, 0x80008088	@ round  6: 0x8000000080008081
	.word	0x00000001, 0x80000082	@ round  7: 0x8000000000008009
	.word	0x00000000, 0x0000000b	@ round  8: 0x000000000000008a
	.word	0x00000000, 0x0000000a	@ round  9: 0x0000000000000088
	.word	0x00000001, 0x00008082	@ round 10: 0x0000000080008009
	.word	0x00000000, 0x00008003	@ round 11: 0x000000008000000a
	.word	0x00000001, 0x0000808b	@ round 12: 0x000000008000808b
	.word	0x00000001, 0x8000000b	@ round 13: 0x800000000000008b
	.word	0x00000001, 0x8000008a	@ round 14: 0x8000000000008089
	.word	0x00000001, 0x80000081	@ round 15: 0x8000000000008003
	.word	0x00000000, 0x80000081	@ round 16: 0x8000000000008002
	.word	0x00000000, 0x80000008	@ round 17: 0x8000000000000080
	.word	0x00000000, 0x00000083	@ round 18: 0x000000000000800a
	.word	0x00000000, 0x80008003	@ round 19: 0x800000008000000a
	.word	0x00000001, 0x80008088	@ round 20: 0x8000000080008081
	.word	0x00000000, 0x80000088	@ round 21: 0x8000000000008080
	.word	0x00000001, 0x00008000	@ round 22: 0x0000000080000001
	.word	0x00000000, 0x80008082	@ round 23: 0x8000000080008008
.size	iotas32, . - iotas32
40
41.type	KeccakF1600_int, %function
42.align	5
43KeccakF1600_int:
44	add	r9,sp,#176
45	add	r12,sp,#0
46	add	r10,sp,#40
47	ldmia	r9,{r4-r9}		@ A[4][2..4]
48KeccakF1600_enter:
49	str	lr,[sp,#440]
50	eor	r11,r11,r11
51	str	r11,[sp,#444]
52	b	.Lround2x
53
54.align	4
55.Lround2x:
56	ldmia	r12,{r0-r3}		@ A[0][0..1]
57	ldmia	r10,{r10-r12,r14}	@ A[1][0..1]
58#ifdef	__thumb2__
59	eor	r0,r0,r10
60	eor	r1,r1,r11
61	eor	r2,r2,r12
62	ldrd	r10,r11,[sp,#56]
63	eor	r3,r3,r14
64	ldrd	r12,r14,[sp,#64]
65	eor	r4,r4,r10
66	eor	r5,r5,r11
67	eor	r6,r6,r12
68	ldrd	r10,r11,[sp,#72]
69	eor	r7,r7,r14
70	ldrd	r12,r14,[sp,#80]
71	eor	r8,r8,r10
72	eor	r9,r9,r11
73	eor	r0,r0,r12
74	ldrd	r10,r11,[sp,#88]
75	eor	r1,r1,r14
76	ldrd	r12,r14,[sp,#96]
77	eor	r2,r2,r10
78	eor	r3,r3,r11
79	eor	r4,r4,r12
80	ldrd	r10,r11,[sp,#104]
81	eor	r5,r5,r14
82	ldrd	r12,r14,[sp,#112]
83	eor	r6,r6,r10
84	eor	r7,r7,r11
85	eor	r8,r8,r12
86	ldrd	r10,r11,[sp,#120]
87	eor	r9,r9,r14
88	ldrd	r12,r14,[sp,#128]
89	eor	r0,r0,r10
90	eor	r1,r1,r11
91	eor	r2,r2,r12
92	ldrd	r10,r11,[sp,#136]
93	eor	r3,r3,r14
94	ldrd	r12,r14,[sp,#144]
95	eor	r4,r4,r10
96	eor	r5,r5,r11
97	eor	r6,r6,r12
98	ldrd	r10,r11,[sp,#152]
99	eor	r7,r7,r14
100	ldrd	r12,r14,[sp,#160]
101	eor	r8,r8,r10
102	eor	r9,r9,r11
103	eor	r0,r0,r12
104	ldrd	r10,r11,[sp,#168]
105	eor	r1,r1,r14
106	ldrd	r12,r14,[sp,#16]
107	eor	r2,r2,r10
108	eor	r3,r3,r11
109	eor	r4,r4,r12
110	ldrd	r10,r11,[sp,#24]
111	eor	r5,r5,r14
112	ldrd	r12,r14,[sp,#32]
113#else
114	eor	r0,r0,r10
115	 add	r10,sp,#56
116	eor	r1,r1,r11
117	eor	r2,r2,r12
118	eor	r3,r3,r14
119	ldmia	r10,{r10-r12,r14}	@ A[1][2..3]
120	eor	r4,r4,r10
121	 add	r10,sp,#72
122	eor	r5,r5,r11
123	eor	r6,r6,r12
124	eor	r7,r7,r14
125	ldmia	r10,{r10-r12,r14}	@ A[1][4]..A[2][0]
126	eor	r8,r8,r10
127	 add	r10,sp,#88
128	eor	r9,r9,r11
129	eor	r0,r0,r12
130	eor	r1,r1,r14
131	ldmia	r10,{r10-r12,r14}	@ A[2][1..2]
132	eor	r2,r2,r10
133	 add	r10,sp,#104
134	eor	r3,r3,r11
135	eor	r4,r4,r12
136	eor	r5,r5,r14
137	ldmia	r10,{r10-r12,r14}	@ A[2][3..4]
138	eor	r6,r6,r10
139	 add	r10,sp,#120
140	eor	r7,r7,r11
141	eor	r8,r8,r12
142	eor	r9,r9,r14
143	ldmia	r10,{r10-r12,r14}	@ A[3][0..1]
144	eor	r0,r0,r10
145	 add	r10,sp,#136
146	eor	r1,r1,r11
147	eor	r2,r2,r12
148	eor	r3,r3,r14
149	ldmia	r10,{r10-r12,r14}	@ A[3][2..3]
150	eor	r4,r4,r10
151	 add	r10,sp,#152
152	eor	r5,r5,r11
153	eor	r6,r6,r12
154	eor	r7,r7,r14
155	ldmia	r10,{r10-r12,r14}	@ A[3][4]..A[4][0]
156	eor	r8,r8,r10
157	ldr	r10,[sp,#168]		@ A[4][1]
158	eor	r9,r9,r11
159	ldr	r11,[sp,#168+4]
160	eor	r0,r0,r12
161	ldr	r12,[sp,#16]		@ A[0][2]
162	eor	r1,r1,r14
163	ldr	r14,[sp,#16+4]
164	eor	r2,r2,r10
165	 add	r10,sp,#24
166	eor	r3,r3,r11
167	eor	r4,r4,r12
168	eor	r5,r5,r14
169	ldmia	r10,{r10-r12,r14}	@ A[0][3..4]
170#endif
171	eor	r6,r6,r10
172	eor	r7,r7,r11
173	eor	r8,r8,r12
174	eor	r9,r9,r14
175
176	eor	r10,r0,r5,ror#32-1	@ E[0] = ROL64(C[2], 1) ^ C[0];
177#ifndef	__thumb2__
178	str	r10,[sp,#208]		@ D[1] = E[0]
179#endif
180	eor	r11,r1,r4
181#ifndef	__thumb2__
182	str	r11,[sp,#208+4]
183#else
184	strd	r10,r11,[sp,#208]		@ D[1] = E[0]
185#endif
186	eor	r12,r6,r1,ror#32-1	@ E[1] = ROL64(C[0], 1) ^ C[3];
187	eor	r14,r7,r0
188#ifndef	__thumb2__
189	str	r12,[sp,#232]		@ D[4] = E[1]
190#endif
191	eor	r0,r8,r3,ror#32-1	@ C[0] = ROL64(C[1], 1) ^ C[4];
192#ifndef	__thumb2__
193	str	r14,[sp,#232+4]
194#else
195	strd	r12,r14,[sp,#232]		@ D[4] = E[1]
196#endif
197	eor	r1,r9,r2
198#ifndef	__thumb2__
199	str	r0,[sp,#200]		@ D[0] = C[0]
200#endif
201	eor	r2,r2,r7,ror#32-1	@ C[1] = ROL64(C[3], 1) ^ C[1];
202#ifndef	__thumb2__
203	ldr	r7,[sp,#144]
204#endif
205	eor	r3,r3,r6
206#ifndef	__thumb2__
207	str	r1,[sp,#200+4]
208#else
209	strd	r0,r1,[sp,#200]		@ D[0] = C[0]
210#endif
211#ifndef	__thumb2__
212	ldr	r6,[sp,#144+4]
213#else
214	ldrd	r7,r6,[sp,#144]
215#endif
216#ifndef	__thumb2__
217	str	r2,[sp,#216]		@ D[2] = C[1]
218#endif
219	eor	r4,r4,r9,ror#32-1	@ C[2] = ROL64(C[4], 1) ^ C[2];
220#ifndef	__thumb2__
221	str	r3,[sp,#216+4]
222#else
223	strd	r2,r3,[sp,#216]		@ D[2] = C[1]
224#endif
225	eor	r5,r5,r8
226
227#ifndef	__thumb2__
228	ldr	r8,[sp,#192]
229#endif
230#ifndef	__thumb2__
231	ldr	r9,[sp,#192+4]
232#else
233	ldrd	r8,r9,[sp,#192]
234#endif
235#ifndef	__thumb2__
236	str	r4,[sp,#224]		@ D[3] = C[2]
237#endif
238	eor	r7,r7,r4
239#ifndef	__thumb2__
240	str	r5,[sp,#224+4]
241#else
242	strd	r4,r5,[sp,#224]		@ D[3] = C[2]
243#endif
244	eor	r6,r6,r5
245#ifndef	__thumb2__
246	ldr	r4,[sp,#0]
247#endif
248	@ mov	r7,r7,ror#32-10		@ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
249	@ mov	r6,r6,ror#32-11
250#ifndef	__thumb2__
251	ldr	r5,[sp,#0+4]
252#else
253	ldrd	r4,r5,[sp,#0]
254#endif
255	eor	r8,r8,r12
256	eor	r9,r9,r14
257#ifndef	__thumb2__
258	ldr	r12,[sp,#96]
259#endif
260	eor	r0,r0,r4
261#ifndef	__thumb2__
262	ldr	r14,[sp,#96+4]
263#else
264	ldrd	r12,r14,[sp,#96]
265#endif
266	@ mov	r8,r8,ror#32-7		@ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
267	@ mov	r9,r9,ror#32-7
268	eor	r1,r1,r5		@ C[0] =       A[0][0] ^ C[0]; /* rotate by 0 */  /* D[0] */
269	eor	r12,r12,r2
270#ifndef	__thumb2__
271	ldr	r2,[sp,#48]
272#endif
273	eor	r14,r14,r3
274#ifndef	__thumb2__
275	ldr	r3,[sp,#48+4]
276#else
277	ldrd	r2,r3,[sp,#48]
278#endif
279	mov	r5,r12,ror#32-21		@ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);   /* D[2] */
280	 ldr	r12,[sp,#444]			@ load counter
281	eor	r2,r2,r10
282	 adr	r10,iotas32
283	mov	r4,r14,ror#32-22
284	 add	r14,r10,r12
285	eor	r3,r3,r11
286	ldmia	r14,{r10,r11}		@ iotas[i]
287	bic	r12,r4,r2,ror#32-22
288	bic	r14,r5,r3,ror#32-22
289	 mov	r2,r2,ror#32-22		@ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]);   /* D[1] */
290	 mov	r3,r3,ror#32-22
291	eor	r12,r12,r0
292	eor	r14,r14,r1
293	eor	r10,r10,r12
294	eor	r11,r11,r14
295#ifndef	__thumb2__
296	str	r10,[sp,#240]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
297#endif
298	bic	r12,r6,r4,ror#11
299#ifndef	__thumb2__
300	str	r11,[sp,#240+4]
301#else
302	strd	r10,r11,[sp,#240]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
303#endif
304	bic	r14,r7,r5,ror#10
305	bic	r10,r8,r6,ror#32-(11-7)
306	bic	r11,r9,r7,ror#32-(10-7)
307	eor	r12,r2,r12,ror#32-11
308#ifndef	__thumb2__
309	str	r12,[sp,#248]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
310#endif
311	eor	r14,r3,r14,ror#32-10
312#ifndef	__thumb2__
313	str	r14,[sp,#248+4]
314#else
315	strd	r12,r14,[sp,#248]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
316#endif
317	eor	r10,r4,r10,ror#32-7
318	eor	r11,r5,r11,ror#32-7
319#ifndef	__thumb2__
320	str	r10,[sp,#256]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
321#endif
322	bic	r12,r0,r8,ror#32-7
323#ifndef	__thumb2__
324	str	r11,[sp,#256+4]
325#else
326	strd	r10,r11,[sp,#256]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
327#endif
328	bic	r14,r1,r9,ror#32-7
329	eor	r12,r12,r6,ror#32-11
330#ifndef	__thumb2__
331	str	r12,[sp,#264]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
332#endif
333	eor	r14,r14,r7,ror#32-10
334#ifndef	__thumb2__
335	str	r14,[sp,#264+4]
336#else
337	strd	r12,r14,[sp,#264]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
338#endif
339	bic	r10,r2,r0
340	 add	r14,sp,#224
341#ifndef	__thumb2__
342	ldr	r0,[sp,#24]		@ A[0][3]
343#endif
344	bic	r11,r3,r1
345#ifndef	__thumb2__
346	ldr	r1,[sp,#24+4]
347#else
348	ldrd	r0,r1,[sp,#24]		@ A[0][3]
349#endif
350	eor	r10,r10,r8,ror#32-7
351	eor	r11,r11,r9,ror#32-7
352#ifndef	__thumb2__
353	str	r10,[sp,#272]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
354#endif
355	 add	r9,sp,#200
356#ifndef	__thumb2__
357	str	r11,[sp,#272+4]
358#else
359	strd	r10,r11,[sp,#272]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
360#endif
361
362	ldmia	r14,{r10-r12,r14}	@ D[3..4]
363	ldmia	r9,{r6-r9}		@ D[0..1]
364
365#ifndef	__thumb2__
366	ldr	r2,[sp,#72]		@ A[1][4]
367#endif
368	eor	r0,r0,r10
369#ifndef	__thumb2__
370	ldr	r3,[sp,#72+4]
371#else
372	ldrd	r2,r3,[sp,#72]		@ A[1][4]
373#endif
374	eor	r1,r1,r11
375	@ mov	r0,r0,ror#32-14		@ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
376#ifndef	__thumb2__
377	ldr	r10,[sp,#128]		@ A[3][1]
378#endif
379	@ mov	r1,r1,ror#32-14
380#ifndef	__thumb2__
381	ldr	r11,[sp,#128+4]
382#else
383	ldrd	r10,r11,[sp,#128]		@ A[3][1]
384#endif
385
386	eor	r2,r2,r12
387#ifndef	__thumb2__
388	ldr	r4,[sp,#80]		@ A[2][0]
389#endif
390	eor	r3,r3,r14
391#ifndef	__thumb2__
392	ldr	r5,[sp,#80+4]
393#else
394	ldrd	r4,r5,[sp,#80]		@ A[2][0]
395#endif
396	@ mov	r2,r2,ror#32-10		@ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
397	@ mov	r3,r3,ror#32-10
398
399	eor	r6,r6,r4
400#ifndef	__thumb2__
401	ldr	r12,[sp,#216]		@ D[2]
402#endif
403	eor	r7,r7,r5
404#ifndef	__thumb2__
405	ldr	r14,[sp,#216+4]
406#else
407	ldrd	r12,r14,[sp,#216]		@ D[2]
408#endif
409	mov	r5,r6,ror#32-1		@ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
410	mov	r4,r7,ror#32-2
411
412	eor	r10,r10,r8
413#ifndef	__thumb2__
414	ldr	r8,[sp,#176]		@ A[4][2]
415#endif
416	eor	r11,r11,r9
417#ifndef	__thumb2__
418	ldr	r9,[sp,#176+4]
419#else
420	ldrd	r8,r9,[sp,#176]		@ A[4][2]
421#endif
422	mov	r7,r10,ror#32-22		@ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
423	mov	r6,r11,ror#32-23
424
425	bic	r10,r4,r2,ror#32-10
426	bic	r11,r5,r3,ror#32-10
427	 eor	r12,r12,r8
428	 eor	r14,r14,r9
429	 mov	r9,r12,ror#32-30		@ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
430	 mov	r8,r14,ror#32-31
431	eor	r10,r10,r0,ror#32-14
432	eor	r11,r11,r1,ror#32-14
433#ifndef	__thumb2__
434	str	r10,[sp,#280]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
435#endif
436	bic	r12,r6,r4
437#ifndef	__thumb2__
438	str	r11,[sp,#280+4]
439#else
440	strd	r10,r11,[sp,#280]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
441#endif
442	bic	r14,r7,r5
443	eor	r12,r12,r2,ror#32-10
444#ifndef	__thumb2__
445	str	r12,[sp,#288]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
446#endif
447	eor	r14,r14,r3,ror#32-10
448#ifndef	__thumb2__
449	str	r14,[sp,#288+4]
450#else
451	strd	r12,r14,[sp,#288]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
452#endif
453	bic	r10,r8,r6
454	bic	r11,r9,r7
455	bic	r12,r0,r8,ror#14
456	bic	r14,r1,r9,ror#14
457	eor	r10,r10,r4
458	eor	r11,r11,r5
459#ifndef	__thumb2__
460	str	r10,[sp,#296]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
461#endif
462	bic	r2,r2,r0,ror#32-(14-10)
463#ifndef	__thumb2__
464	str	r11,[sp,#296+4]
465#else
466	strd	r10,r11,[sp,#296]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
467#endif
468	eor	r12,r6,r12,ror#32-14
469	bic	r11,r3,r1,ror#32-(14-10)
470#ifndef	__thumb2__
471	str	r12,[sp,#304]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
472#endif
473	eor	r14,r7,r14,ror#32-14
474#ifndef	__thumb2__
475	str	r14,[sp,#304+4]
476#else
477	strd	r12,r14,[sp,#304]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
478#endif
479	 add	r12,sp,#208
480#ifndef	__thumb2__
481	ldr	r1,[sp,#8]		@ A[0][1]
482#endif
483	eor	r10,r8,r2,ror#32-10
484#ifndef	__thumb2__
485	ldr	r0,[sp,#8+4]
486#else
487	ldrd	r1,r0,[sp,#8]		@ A[0][1]
488#endif
489	eor	r11,r9,r11,ror#32-10
490#ifndef	__thumb2__
491	str	r10,[sp,#312]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
492#endif
493#ifndef	__thumb2__
494	str	r11,[sp,#312+4]
495#else
496	strd	r10,r11,[sp,#312]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
497#endif
498
499	add	r9,sp,#224
500	ldmia	r12,{r10-r12,r14}	@ D[1..2]
501#ifndef	__thumb2__
502	ldr	r2,[sp,#56]		@ A[1][2]
503#endif
504#ifndef	__thumb2__
505	ldr	r3,[sp,#56+4]
506#else
507	ldrd	r2,r3,[sp,#56]		@ A[1][2]
508#endif
509	ldmia	r9,{r6-r9}		@ D[3..4]
510
511	eor	r1,r1,r10
512#ifndef	__thumb2__
513	ldr	r4,[sp,#104]		@ A[2][3]
514#endif
515	eor	r0,r0,r11
516#ifndef	__thumb2__
517	ldr	r5,[sp,#104+4]
518#else
519	ldrd	r4,r5,[sp,#104]		@ A[2][3]
520#endif
521	mov	r0,r0,ror#32-1		@ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
522
523	eor	r2,r2,r12
524#ifndef	__thumb2__
525	ldr	r10,[sp,#152]		@ A[3][4]
526#endif
527	eor	r3,r3,r14
528#ifndef	__thumb2__
529	ldr	r11,[sp,#152+4]
530#else
531	ldrd	r10,r11,[sp,#152]		@ A[3][4]
532#endif
533	@ mov	r2,r2,ror#32-3		@ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
534#ifndef	__thumb2__
535	ldr	r12,[sp,#200]		@ D[0]
536#endif
537	@ mov	r3,r3,ror#32-3
538#ifndef	__thumb2__
539	ldr	r14,[sp,#200+4]
540#else
541	ldrd	r12,r14,[sp,#200]		@ D[0]
542#endif
543
544	eor	r4,r4,r6
545	eor	r5,r5,r7
546	@ mov	r5,r6,ror#32-12		@ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
547	@ mov	r4,r7,ror#32-13		@ [track reverse order below]
548
549	eor	r10,r10,r8
550#ifndef	__thumb2__
551	ldr	r8,[sp,#160]		@ A[4][0]
552#endif
553	eor	r11,r11,r9
554#ifndef	__thumb2__
555	ldr	r9,[sp,#160+4]
556#else
557	ldrd	r8,r9,[sp,#160]		@ A[4][0]
558#endif
559	mov	r6,r10,ror#32-4		@ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
560	mov	r7,r11,ror#32-4
561
562	eor	r12,r12,r8
563	eor	r14,r14,r9
564	mov	r8,r12,ror#32-9		@ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
565	mov	r9,r14,ror#32-9
566
567	bic	r10,r5,r2,ror#13-3
568	bic	r11,r4,r3,ror#12-3
569	bic	r12,r6,r5,ror#32-13
570	bic	r14,r7,r4,ror#32-12
571	eor	r10,r0,r10,ror#32-13
572	eor	r11,r1,r11,ror#32-12
573#ifndef	__thumb2__
574	str	r10,[sp,#320]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
575#endif
576	eor	r12,r12,r2,ror#32-3
577#ifndef	__thumb2__
578	str	r11,[sp,#320+4]
579#else
580	strd	r10,r11,[sp,#320]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
581#endif
582	eor	r14,r14,r3,ror#32-3
583#ifndef	__thumb2__
584	str	r12,[sp,#328]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
585#endif
586	bic	r10,r8,r6
587	bic	r11,r9,r7
588#ifndef	__thumb2__
589	str	r14,[sp,#328+4]
590#else
591	strd	r12,r14,[sp,#328]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
592#endif
593	eor	r10,r10,r5,ror#32-13
594	eor	r11,r11,r4,ror#32-12
595#ifndef	__thumb2__
596	str	r10,[sp,#336]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
597#endif
598	bic	r12,r0,r8
599#ifndef	__thumb2__
600	str	r11,[sp,#336+4]
601#else
602	strd	r10,r11,[sp,#336]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
603#endif
604	bic	r14,r1,r9
605	eor	r12,r12,r6
606	eor	r14,r14,r7
607#ifndef	__thumb2__
608	str	r12,[sp,#344]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
609#endif
610	bic	r10,r2,r0,ror#3
611#ifndef	__thumb2__
612	str	r14,[sp,#344+4]
613#else
614	strd	r12,r14,[sp,#344]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
615#endif
616	bic	r11,r3,r1,ror#3
617#ifndef	__thumb2__
618	ldr	r1,[sp,#32]		@ A[0][4] [in reverse order]
619#endif
620	eor	r10,r8,r10,ror#32-3
621#ifndef	__thumb2__
622	ldr	r0,[sp,#32+4]
623#else
624	ldrd	r1,r0,[sp,#32]		@ A[0][4] [in reverse order]
625#endif
626	eor	r11,r9,r11,ror#32-3
627#ifndef	__thumb2__
628	str	r10,[sp,#352]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
629#endif
630	 add	r9,sp,#208
631#ifndef	__thumb2__
632	str	r11,[sp,#352+4]
633#else
634	strd	r10,r11,[sp,#352]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
635#endif
636
637#ifndef	__thumb2__
638	ldr	r10,[sp,#232]		@ D[4]
639#endif
640#ifndef	__thumb2__
641	ldr	r11,[sp,#232+4]
642#else
643	ldrd	r10,r11,[sp,#232]		@ D[4]
644#endif
645#ifndef	__thumb2__
646	ldr	r12,[sp,#200]		@ D[0]
647#endif
648#ifndef	__thumb2__
649	ldr	r14,[sp,#200+4]
650#else
651	ldrd	r12,r14,[sp,#200]		@ D[0]
652#endif
653
654	ldmia	r9,{r6-r9}		@ D[1..2]
655
656	eor	r1,r1,r10
657#ifndef	__thumb2__
658	ldr	r2,[sp,#40]		@ A[1][0]
659#endif
660	eor	r0,r0,r11
661#ifndef	__thumb2__
662	ldr	r3,[sp,#40+4]
663#else
664	ldrd	r2,r3,[sp,#40]		@ A[1][0]
665#endif
666	@ mov	r1,r10,ror#32-13		@ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
667#ifndef	__thumb2__
668	ldr	r4,[sp,#88]		@ A[2][1]
669#endif
670	@ mov	r0,r11,ror#32-14		@ [was loaded in reverse order]
671#ifndef	__thumb2__
672	ldr	r5,[sp,#88+4]
673#else
674	ldrd	r4,r5,[sp,#88]		@ A[2][1]
675#endif
676
677	eor	r2,r2,r12
678#ifndef	__thumb2__
679	ldr	r10,[sp,#136]		@ A[3][2]
680#endif
681	eor	r3,r3,r14
682#ifndef	__thumb2__
683	ldr	r11,[sp,#136+4]
684#else
685	ldrd	r10,r11,[sp,#136]		@ A[3][2]
686#endif
687	@ mov	r2,r2,ror#32-18		@ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
688#ifndef	__thumb2__
689	ldr	r12,[sp,#224]		@ D[3]
690#endif
691	@ mov	r3,r3,ror#32-18
692#ifndef	__thumb2__
693	ldr	r14,[sp,#224+4]
694#else
695	ldrd	r12,r14,[sp,#224]		@ D[3]
696#endif
697
698	eor	r6,r6,r4
699	eor	r7,r7,r5
700	mov	r4,r6,ror#32-5		@ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
701	mov	r5,r7,ror#32-5
702
703	eor	r10,r10,r8
704#ifndef	__thumb2__
705	ldr	r8,[sp,#184]		@ A[4][3]
706#endif
707	eor	r11,r11,r9
708#ifndef	__thumb2__
709	ldr	r9,[sp,#184+4]
710#else
711	ldrd	r8,r9,[sp,#184]		@ A[4][3]
712#endif
713	mov	r7,r10,ror#32-7		@ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
714	mov	r6,r11,ror#32-8
715
716	eor	r12,r12,r8
717	eor	r14,r14,r9
718	mov	r8,r12,ror#32-28		@ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
719	mov	r9,r14,ror#32-28
720
721	bic	r10,r4,r2,ror#32-18
722	bic	r11,r5,r3,ror#32-18
723	eor	r10,r10,r0,ror#32-14
724	eor	r11,r11,r1,ror#32-13
725#ifndef	__thumb2__
726	str	r10,[sp,#360]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
727#endif
728	bic	r12,r6,r4
729#ifndef	__thumb2__
730	str	r11,[sp,#360+4]
731#else
732	strd	r10,r11,[sp,#360]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
733#endif
734	bic	r14,r7,r5
735	eor	r12,r12,r2,ror#32-18
736#ifndef	__thumb2__
737	str	r12,[sp,#368]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
738#endif
739	eor	r14,r14,r3,ror#32-18
740#ifndef	__thumb2__
741	str	r14,[sp,#368+4]
742#else
743	strd	r12,r14,[sp,#368]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
744#endif
745	bic	r10,r8,r6
746	bic	r11,r9,r7
747	bic	r12,r0,r8,ror#14
748	bic	r14,r1,r9,ror#13
749	eor	r10,r10,r4
750	eor	r11,r11,r5
751#ifndef	__thumb2__
752	str	r10,[sp,#376]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
753#endif
754	bic	r2,r2,r0,ror#18-14
755#ifndef	__thumb2__
756	str	r11,[sp,#376+4]
757#else
758	strd	r10,r11,[sp,#376]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
759#endif
760	eor	r12,r6,r12,ror#32-14
761	bic	r11,r3,r1,ror#18-13
762	eor	r14,r7,r14,ror#32-13
763#ifndef	__thumb2__
764	str	r12,[sp,#384]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
765#endif
766#ifndef	__thumb2__
767	str	r14,[sp,#384+4]
768#else
769	strd	r12,r14,[sp,#384]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
770#endif
771	 add	r14,sp,#216
772#ifndef	__thumb2__
773	ldr	r0,[sp,#16]		@ A[0][2]
774#endif
775	eor	r10,r8,r2,ror#32-18
776#ifndef	__thumb2__
777	ldr	r1,[sp,#16+4]
778#else
779	ldrd	r0,r1,[sp,#16]		@ A[0][2]
780#endif
781	eor	r11,r9,r11,ror#32-18
782#ifndef	__thumb2__
783	str	r10,[sp,#392]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
784#endif
785#ifndef	__thumb2__
786	str	r11,[sp,#392+4]
787#else
788	strd	r10,r11,[sp,#392]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
789#endif
790
791	ldmia	r14,{r10-r12,r14}	@ D[2..3]
792#ifndef	__thumb2__
793	ldr	r2,[sp,#64]		@ A[1][3]
794#endif
795#ifndef	__thumb2__
796	ldr	r3,[sp,#64+4]
797#else
798	ldrd	r2,r3,[sp,#64]		@ A[1][3]
799#endif
800#ifndef	__thumb2__
801	ldr	r6,[sp,#232]		@ D[4]
802#endif
803#ifndef	__thumb2__
804	ldr	r7,[sp,#232+4]
805#else
806	ldrd	r6,r7,[sp,#232]		@ D[4]
807#endif
808
809	eor	r0,r0,r10
810#ifndef	__thumb2__
811	ldr	r4,[sp,#112]		@ A[2][4]
812#endif
813	eor	r1,r1,r11
814#ifndef	__thumb2__
815	ldr	r5,[sp,#112+4]
816#else
817	ldrd	r4,r5,[sp,#112]		@ A[2][4]
818#endif
819	@ mov	r0,r0,ror#32-31		@ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
820#ifndef	__thumb2__
821	ldr	r8,[sp,#200]		@ D[0]
822#endif
823	@ mov	r1,r1,ror#32-31
824#ifndef	__thumb2__
825	ldr	r9,[sp,#200+4]
826#else
827	ldrd	r8,r9,[sp,#200]		@ D[0]
828#endif
829
830	eor	r12,r12,r2
831#ifndef	__thumb2__
832	ldr	r10,[sp,#120]		@ A[3][0]
833#endif
834	eor	r14,r14,r3
835#ifndef	__thumb2__
836	ldr	r11,[sp,#120+4]
837#else
838	ldrd	r10,r11,[sp,#120]		@ A[3][0]
839#endif
840	mov	r3,r12,ror#32-27		@ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
841#ifndef	__thumb2__
842	ldr	r12,[sp,#208]		@ D[1]
843#endif
844	mov	r2,r14,ror#32-28
845#ifndef	__thumb2__
846	ldr	r14,[sp,#208+4]
847#else
848	ldrd	r12,r14,[sp,#208]		@ D[1]
849#endif
850
851	eor	r6,r6,r4
852	eor	r7,r7,r5
853	mov	r5,r6,ror#32-19		@ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
854	mov	r4,r7,ror#32-20
855
856	eor	r10,r10,r8
857#ifndef	__thumb2__
858	ldr	r8,[sp,#168]		@ A[4][1]
859#endif
860	eor	r11,r11,r9
861#ifndef	__thumb2__
862	ldr	r9,[sp,#168+4]
863#else
864	ldrd	r8,r9,[sp,#168]		@ A[4][1]
865#endif
866	mov	r7,r10,ror#32-20		@ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
867	mov	r6,r11,ror#32-21
868
869	eor	r8,r8,r12
870	eor	r9,r9,r14
871	@ mov	r8,r2,ror#32-1		@ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
872	@ mov	r9,r3,ror#32-1
873
874	bic	r10,r4,r2
875	bic	r11,r5,r3
876	eor	r10,r10,r0,ror#32-31
877#ifndef	__thumb2__
878	str	r10,[sp,#400]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
879#endif
880	eor	r11,r11,r1,ror#32-31
881#ifndef	__thumb2__
882	str	r11,[sp,#400+4]
883#else
884	strd	r10,r11,[sp,#400]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
885#endif
886	bic	r12,r6,r4
887	bic	r14,r7,r5
888	eor	r12,r12,r2
889	eor	r14,r14,r3
890#ifndef	__thumb2__
891	str	r12,[sp,#408]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
892#endif
893	bic	r10,r8,r6,ror#1
894#ifndef	__thumb2__
895	str	r14,[sp,#408+4]
896#else
897	strd	r12,r14,[sp,#408]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
898#endif
899	bic	r11,r9,r7,ror#1
900	bic	r12,r0,r8,ror#31-1
901	bic	r14,r1,r9,ror#31-1
902	eor	r4,r4,r10,ror#32-1
903#ifndef	__thumb2__
904	str	r4,[sp,#416]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
905#endif
906	eor	r5,r5,r11,ror#32-1
907#ifndef	__thumb2__
908	str	r5,[sp,#416+4]
909#else
910	strd	r4,r5,[sp,#416]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
911#endif
912	eor	r6,r6,r12,ror#32-31
913	eor	r7,r7,r14,ror#32-31
914#ifndef	__thumb2__
915	str	r6,[sp,#424]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
916#endif
917	bic	r10,r2,r0,ror#32-31
918#ifndef	__thumb2__
919	str	r7,[sp,#424+4]
920#else
921	strd	r6,r7,[sp,#424]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
922#endif
923	bic	r11,r3,r1,ror#32-31
924	 add	r12,sp,#240
925	eor	r8,r10,r8,ror#32-1
926	 add	r10,sp,#280
927	eor	r9,r11,r9,ror#32-1
928#ifndef	__thumb2__
929	str	r8,[sp,#432]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
930#endif
931#ifndef	__thumb2__
932	str	r9,[sp,#432+4]
933#else
934	strd	r8,r9,[sp,#432]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
935#endif
936	ldmia	r12,{r0-r3}		@ A[0][0..1]
937	ldmia	r10,{r10-r12,r14}	@ A[1][0..1]
938#ifdef	__thumb2__
939	eor	r0,r0,r10
940	eor	r1,r1,r11
941	eor	r2,r2,r12
942	ldrd	r10,r11,[sp,#296]
943	eor	r3,r3,r14
944	ldrd	r12,r14,[sp,#304]
945	eor	r4,r4,r10
946	eor	r5,r5,r11
947	eor	r6,r6,r12
948	ldrd	r10,r11,[sp,#312]
949	eor	r7,r7,r14
950	ldrd	r12,r14,[sp,#320]
951	eor	r8,r8,r10
952	eor	r9,r9,r11
953	eor	r0,r0,r12
954	ldrd	r10,r11,[sp,#328]
955	eor	r1,r1,r14
956	ldrd	r12,r14,[sp,#336]
957	eor	r2,r2,r10
958	eor	r3,r3,r11
959	eor	r4,r4,r12
960	ldrd	r10,r11,[sp,#344]
961	eor	r5,r5,r14
962	ldrd	r12,r14,[sp,#352]
963	eor	r6,r6,r10
964	eor	r7,r7,r11
965	eor	r8,r8,r12
966	ldrd	r10,r11,[sp,#360]
967	eor	r9,r9,r14
968	ldrd	r12,r14,[sp,#368]
969	eor	r0,r0,r10
970	eor	r1,r1,r11
971	eor	r2,r2,r12
972	ldrd	r10,r11,[sp,#376]
973	eor	r3,r3,r14
974	ldrd	r12,r14,[sp,#384]
975	eor	r4,r4,r10
976	eor	r5,r5,r11
977	eor	r6,r6,r12
978	ldrd	r10,r11,[sp,#392]
979	eor	r7,r7,r14
980	ldrd	r12,r14,[sp,#400]
981	eor	r8,r8,r10
982	eor	r9,r9,r11
983	eor	r0,r0,r12
984	ldrd	r10,r11,[sp,#408]
985	eor	r1,r1,r14
986	ldrd	r12,r14,[sp,#256]
987	eor	r2,r2,r10
988	eor	r3,r3,r11
989	eor	r4,r4,r12
990	ldrd	r10,r11,[sp,#264]
991	eor	r5,r5,r14
992	ldrd	r12,r14,[sp,#272]
993#else
994	eor	r0,r0,r10
995	 add	r10,sp,#296
996	eor	r1,r1,r11
997	eor	r2,r2,r12
998	eor	r3,r3,r14
999	ldmia	r10,{r10-r12,r14}	@ A[1][2..3]
1000	eor	r4,r4,r10
1001	 add	r10,sp,#312
1002	eor	r5,r5,r11
1003	eor	r6,r6,r12
1004	eor	r7,r7,r14
1005	ldmia	r10,{r10-r12,r14}	@ A[1][4]..A[2][0]
1006	eor	r8,r8,r10
1007	 add	r10,sp,#328
1008	eor	r9,r9,r11
1009	eor	r0,r0,r12
1010	eor	r1,r1,r14
1011	ldmia	r10,{r10-r12,r14}	@ A[2][1..2]
1012	eor	r2,r2,r10
1013	 add	r10,sp,#344
1014	eor	r3,r3,r11
1015	eor	r4,r4,r12
1016	eor	r5,r5,r14
1017	ldmia	r10,{r10-r12,r14}	@ A[2][3..4]
1018	eor	r6,r6,r10
1019	 add	r10,sp,#360
1020	eor	r7,r7,r11
1021	eor	r8,r8,r12
1022	eor	r9,r9,r14
1023	ldmia	r10,{r10-r12,r14}	@ A[3][0..1]
1024	eor	r0,r0,r10
1025	 add	r10,sp,#376
1026	eor	r1,r1,r11
1027	eor	r2,r2,r12
1028	eor	r3,r3,r14
1029	ldmia	r10,{r10-r12,r14}	@ A[3][2..3]
1030	eor	r4,r4,r10
1031	 add	r10,sp,#392
1032	eor	r5,r5,r11
1033	eor	r6,r6,r12
1034	eor	r7,r7,r14
1035	ldmia	r10,{r10-r12,r14}	@ A[3][4]..A[4][0]
1036	eor	r8,r8,r10
1037	ldr	r10,[sp,#408]		@ A[4][1]
1038	eor	r9,r9,r11
1039	ldr	r11,[sp,#408+4]
1040	eor	r0,r0,r12
1041	ldr	r12,[sp,#256]		@ A[0][2]
1042	eor	r1,r1,r14
1043	ldr	r14,[sp,#256+4]
1044	eor	r2,r2,r10
1045	 add	r10,sp,#264
1046	eor	r3,r3,r11
1047	eor	r4,r4,r12
1048	eor	r5,r5,r14
1049	ldmia	r10,{r10-r12,r14}	@ A[0][3..4]
1050#endif
1051	eor	r6,r6,r10
1052	eor	r7,r7,r11
1053	eor	r8,r8,r12
1054	eor	r9,r9,r14
1055
1056	eor	r10,r0,r5,ror#32-1	@ E[0] = ROL64(C[2], 1) ^ C[0];
1057#ifndef	__thumb2__
1058	str	r10,[sp,#208]		@ D[1] = E[0]
1059#endif
1060	eor	r11,r1,r4
1061#ifndef	__thumb2__
1062	str	r11,[sp,#208+4]
1063#else
1064	strd	r10,r11,[sp,#208]		@ D[1] = E[0]
1065#endif
1066	eor	r12,r6,r1,ror#32-1	@ E[1] = ROL64(C[0], 1) ^ C[3];
1067	eor	r14,r7,r0
1068#ifndef	__thumb2__
1069	str	r12,[sp,#232]		@ D[4] = E[1]
1070#endif
1071	eor	r0,r8,r3,ror#32-1	@ C[0] = ROL64(C[1], 1) ^ C[4];
1072#ifndef	__thumb2__
1073	str	r14,[sp,#232+4]
1074#else
1075	strd	r12,r14,[sp,#232]		@ D[4] = E[1]
1076#endif
1077	eor	r1,r9,r2
1078#ifndef	__thumb2__
1079	str	r0,[sp,#200]		@ D[0] = C[0]
1080#endif
1081	eor	r2,r2,r7,ror#32-1	@ C[1] = ROL64(C[3], 1) ^ C[1];
1082#ifndef	__thumb2__
1083	ldr	r7,[sp,#384]
1084#endif
1085	eor	r3,r3,r6
1086#ifndef	__thumb2__
1087	str	r1,[sp,#200+4]
1088#else
1089	strd	r0,r1,[sp,#200]		@ D[0] = C[0]
1090#endif
1091#ifndef	__thumb2__
1092	ldr	r6,[sp,#384+4]
1093#else
1094	ldrd	r7,r6,[sp,#384]
1095#endif
1096#ifndef	__thumb2__
1097	str	r2,[sp,#216]		@ D[2] = C[1]
1098#endif
1099	eor	r4,r4,r9,ror#32-1	@ C[2] = ROL64(C[4], 1) ^ C[2];
1100#ifndef	__thumb2__
1101	str	r3,[sp,#216+4]
1102#else
1103	strd	r2,r3,[sp,#216]		@ D[2] = C[1]
1104#endif
1105	eor	r5,r5,r8
1106
1107#ifndef	__thumb2__
1108	ldr	r8,[sp,#432]
1109#endif
1110#ifndef	__thumb2__
1111	ldr	r9,[sp,#432+4]
1112#else
1113	ldrd	r8,r9,[sp,#432]
1114#endif
1115#ifndef	__thumb2__
1116	str	r4,[sp,#224]		@ D[3] = C[2]
1117#endif
1118	eor	r7,r7,r4
1119#ifndef	__thumb2__
1120	str	r5,[sp,#224+4]
1121#else
1122	strd	r4,r5,[sp,#224]		@ D[3] = C[2]
1123#endif
1124	eor	r6,r6,r5
1125#ifndef	__thumb2__
1126	ldr	r4,[sp,#240]
1127#endif
1128	@ mov	r7,r7,ror#32-10		@ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
1129	@ mov	r6,r6,ror#32-11
1130#ifndef	__thumb2__
1131	ldr	r5,[sp,#240+4]
1132#else
1133	ldrd	r4,r5,[sp,#240]
1134#endif
1135	eor	r8,r8,r12
1136	eor	r9,r9,r14
1137#ifndef	__thumb2__
1138	ldr	r12,[sp,#336]
1139#endif
1140	eor	r0,r0,r4
1141#ifndef	__thumb2__
1142	ldr	r14,[sp,#336+4]
1143#else
1144	ldrd	r12,r14,[sp,#336]
1145#endif
1146	@ mov	r8,r8,ror#32-7		@ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
1147	@ mov	r9,r9,ror#32-7
1148	eor	r1,r1,r5		@ C[0] =       A[0][0] ^ C[0]; /* rotate by 0 */  /* D[0] */
1149	eor	r12,r12,r2
1150#ifndef	__thumb2__
1151	ldr	r2,[sp,#288]
1152#endif
1153	eor	r14,r14,r3
1154#ifndef	__thumb2__
1155	ldr	r3,[sp,#288+4]
1156#else
1157	ldrd	r2,r3,[sp,#288]
1158#endif
1159	mov	r5,r12,ror#32-21		@ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);   /* D[2] */
1160	 ldr	r12,[sp,#444]			@ load counter
1161	eor	r2,r2,r10
1162	 adr	r10,iotas32
1163	mov	r4,r14,ror#32-22
1164	 add	r14,r10,r12
1165	eor	r3,r3,r11
1166#ifndef	__thumb2__
1167	ldr	r10,[r14,#8]		@ iotas[i].lo
1168#endif
1169	add	r12,r12,#16
1170#ifndef	__thumb2__
1171	ldr	r11,[r14,#12]		@ iotas[i].hi
1172#else
1173	ldrd	r10,r11,[r14,#8]		@ iotas[i].lo
1174#endif
1175	cmp	r12,#192
1176	str	r12,[sp,#444]			@ store counter
1177	bic	r12,r4,r2,ror#32-22
1178	bic	r14,r5,r3,ror#32-22
1179	 mov	r2,r2,ror#32-22		@ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]);   /* D[1] */
1180	 mov	r3,r3,ror#32-22
1181	eor	r12,r12,r0
1182	eor	r14,r14,r1
1183	eor	r10,r10,r12
1184	eor	r11,r11,r14
1185#ifndef	__thumb2__
1186	str	r10,[sp,#0]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
1187#endif
1188	bic	r12,r6,r4,ror#11
1189#ifndef	__thumb2__
1190	str	r11,[sp,#0+4]
1191#else
1192	strd	r10,r11,[sp,#0]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
1193#endif
1194	bic	r14,r7,r5,ror#10
1195	bic	r10,r8,r6,ror#32-(11-7)
1196	bic	r11,r9,r7,ror#32-(10-7)
1197	eor	r12,r2,r12,ror#32-11
1198#ifndef	__thumb2__
1199	str	r12,[sp,#8]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
1200#endif
1201	eor	r14,r3,r14,ror#32-10
1202#ifndef	__thumb2__
1203	str	r14,[sp,#8+4]
1204#else
1205	strd	r12,r14,[sp,#8]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
1206#endif
1207	eor	r10,r4,r10,ror#32-7
1208	eor	r11,r5,r11,ror#32-7
1209#ifndef	__thumb2__
1210	str	r10,[sp,#16]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
1211#endif
1212	bic	r12,r0,r8,ror#32-7
1213#ifndef	__thumb2__
1214	str	r11,[sp,#16+4]
1215#else
1216	strd	r10,r11,[sp,#16]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
1217#endif
1218	bic	r14,r1,r9,ror#32-7
1219	eor	r12,r12,r6,ror#32-11
1220#ifndef	__thumb2__
1221	str	r12,[sp,#24]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
1222#endif
1223	eor	r14,r14,r7,ror#32-10
1224#ifndef	__thumb2__
1225	str	r14,[sp,#24+4]
1226#else
1227	strd	r12,r14,[sp,#24]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
1228#endif
1229	bic	r10,r2,r0
1230	 add	r14,sp,#224
1231#ifndef	__thumb2__
1232	ldr	r0,[sp,#264]		@ A[0][3]
1233#endif
1234	bic	r11,r3,r1
1235#ifndef	__thumb2__
1236	ldr	r1,[sp,#264+4]
1237#else
1238	ldrd	r0,r1,[sp,#264]		@ A[0][3]
1239#endif
1240	eor	r10,r10,r8,ror#32-7
1241	eor	r11,r11,r9,ror#32-7
1242#ifndef	__thumb2__
1243	str	r10,[sp,#32]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
1244#endif
1245	 add	r9,sp,#200
1246#ifndef	__thumb2__
1247	str	r11,[sp,#32+4]
1248#else
1249	strd	r10,r11,[sp,#32]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
1250#endif
1251
1252	ldmia	r14,{r10-r12,r14}	@ D[3..4]
1253	ldmia	r9,{r6-r9}		@ D[0..1]
1254
1255#ifndef	__thumb2__
1256	ldr	r2,[sp,#312]		@ A[1][4]
1257#endif
1258	eor	r0,r0,r10
1259#ifndef	__thumb2__
1260	ldr	r3,[sp,#312+4]
1261#else
1262	ldrd	r2,r3,[sp,#312]		@ A[1][4]
1263#endif
1264	eor	r1,r1,r11
1265	@ mov	r0,r0,ror#32-14		@ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
1266#ifndef	__thumb2__
1267	ldr	r10,[sp,#368]		@ A[3][1]
1268#endif
1269	@ mov	r1,r1,ror#32-14
1270#ifndef	__thumb2__
1271	ldr	r11,[sp,#368+4]
1272#else
1273	ldrd	r10,r11,[sp,#368]		@ A[3][1]
1274#endif
1275
1276	eor	r2,r2,r12
1277#ifndef	__thumb2__
1278	ldr	r4,[sp,#320]		@ A[2][0]
1279#endif
1280	eor	r3,r3,r14
1281#ifndef	__thumb2__
1282	ldr	r5,[sp,#320+4]
1283#else
1284	ldrd	r4,r5,[sp,#320]		@ A[2][0]
1285#endif
1286	@ mov	r2,r2,ror#32-10		@ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
1287	@ mov	r3,r3,ror#32-10
1288
1289	eor	r6,r6,r4
1290#ifndef	__thumb2__
1291	ldr	r12,[sp,#216]		@ D[2]
1292#endif
1293	eor	r7,r7,r5
1294#ifndef	__thumb2__
1295	ldr	r14,[sp,#216+4]
1296#else
1297	ldrd	r12,r14,[sp,#216]		@ D[2]
1298#endif
1299	mov	r5,r6,ror#32-1		@ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
1300	mov	r4,r7,ror#32-2
1301
1302	eor	r10,r10,r8
1303#ifndef	__thumb2__
1304	ldr	r8,[sp,#416]		@ A[4][2]
1305#endif
1306	eor	r11,r11,r9
1307#ifndef	__thumb2__
1308	ldr	r9,[sp,#416+4]
1309#else
1310	ldrd	r8,r9,[sp,#416]		@ A[4][2]
1311#endif
1312	mov	r7,r10,ror#32-22		@ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
1313	mov	r6,r11,ror#32-23
1314
1315	bic	r10,r4,r2,ror#32-10
1316	bic	r11,r5,r3,ror#32-10
1317	 eor	r12,r12,r8
1318	 eor	r14,r14,r9
1319	 mov	r9,r12,ror#32-30		@ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
1320	 mov	r8,r14,ror#32-31
1321	eor	r10,r10,r0,ror#32-14
1322	eor	r11,r11,r1,ror#32-14
1323#ifndef	__thumb2__
1324	str	r10,[sp,#40]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
1325#endif
1326	bic	r12,r6,r4
1327#ifndef	__thumb2__
1328	str	r11,[sp,#40+4]
1329#else
1330	strd	r10,r11,[sp,#40]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
1331#endif
1332	bic	r14,r7,r5
1333	eor	r12,r12,r2,ror#32-10
1334#ifndef	__thumb2__
1335	str	r12,[sp,#48]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
1336#endif
1337	eor	r14,r14,r3,ror#32-10
1338#ifndef	__thumb2__
1339	str	r14,[sp,#48+4]
1340#else
1341	strd	r12,r14,[sp,#48]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
1342#endif
1343	bic	r10,r8,r6
1344	bic	r11,r9,r7
1345	bic	r12,r0,r8,ror#14
1346	bic	r14,r1,r9,ror#14
1347	eor	r10,r10,r4
1348	eor	r11,r11,r5
1349#ifndef	__thumb2__
1350	str	r10,[sp,#56]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
1351#endif
1352	bic	r2,r2,r0,ror#32-(14-10)
1353#ifndef	__thumb2__
1354	str	r11,[sp,#56+4]
1355#else
1356	strd	r10,r11,[sp,#56]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
1357#endif
1358	eor	r12,r6,r12,ror#32-14
1359	bic	r11,r3,r1,ror#32-(14-10)
1360#ifndef	__thumb2__
1361	str	r12,[sp,#64]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
1362#endif
1363	eor	r14,r7,r14,ror#32-14
1364#ifndef	__thumb2__
1365	str	r14,[sp,#64+4]
1366#else
1367	strd	r12,r14,[sp,#64]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
1368#endif
1369	 add	r12,sp,#208
1370#ifndef	__thumb2__
1371	ldr	r1,[sp,#248]		@ A[0][1]
1372#endif
1373	eor	r10,r8,r2,ror#32-10
1374#ifndef	__thumb2__
1375	ldr	r0,[sp,#248+4]
1376#else
1377	ldrd	r1,r0,[sp,#248]		@ A[0][1]
1378#endif
1379	eor	r11,r9,r11,ror#32-10
1380#ifndef	__thumb2__
1381	str	r10,[sp,#72]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
1382#endif
1383#ifndef	__thumb2__
1384	str	r11,[sp,#72+4]
1385#else
1386	strd	r10,r11,[sp,#72]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
1387#endif
1388
1389	add	r9,sp,#224
1390	ldmia	r12,{r10-r12,r14}	@ D[1..2]
1391#ifndef	__thumb2__
1392	ldr	r2,[sp,#296]		@ A[1][2]
1393#endif
1394#ifndef	__thumb2__
1395	ldr	r3,[sp,#296+4]
1396#else
1397	ldrd	r2,r3,[sp,#296]		@ A[1][2]
1398#endif
1399	ldmia	r9,{r6-r9}		@ D[3..4]
1400
1401	eor	r1,r1,r10
1402#ifndef	__thumb2__
1403	ldr	r4,[sp,#344]		@ A[2][3]
1404#endif
1405	eor	r0,r0,r11
1406#ifndef	__thumb2__
1407	ldr	r5,[sp,#344+4]
1408#else
1409	ldrd	r4,r5,[sp,#344]		@ A[2][3]
1410#endif
1411	mov	r0,r0,ror#32-1		@ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
1412
1413	eor	r2,r2,r12
1414#ifndef	__thumb2__
1415	ldr	r10,[sp,#392]		@ A[3][4]
1416#endif
1417	eor	r3,r3,r14
1418#ifndef	__thumb2__
1419	ldr	r11,[sp,#392+4]
1420#else
1421	ldrd	r10,r11,[sp,#392]		@ A[3][4]
1422#endif
1423	@ mov	r2,r2,ror#32-3		@ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
1424#ifndef	__thumb2__
1425	ldr	r12,[sp,#200]		@ D[0]
1426#endif
1427	@ mov	r3,r3,ror#32-3
1428#ifndef	__thumb2__
1429	ldr	r14,[sp,#200+4]
1430#else
1431	ldrd	r12,r14,[sp,#200]		@ D[0]
1432#endif
1433
1434	eor	r4,r4,r6
1435	eor	r5,r5,r7
1436	@ mov	r5,r6,ror#32-12		@ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
1437	@ mov	r4,r7,ror#32-13		@ [track reverse order below]
1438
1439	eor	r10,r10,r8
1440#ifndef	__thumb2__
1441	ldr	r8,[sp,#400]		@ A[4][0]
1442#endif
1443	eor	r11,r11,r9
1444#ifndef	__thumb2__
1445	ldr	r9,[sp,#400+4]
1446#else
1447	ldrd	r8,r9,[sp,#400]		@ A[4][0]
1448#endif
1449	mov	r6,r10,ror#32-4		@ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
1450	mov	r7,r11,ror#32-4
1451
1452	eor	r12,r12,r8
1453	eor	r14,r14,r9
1454	mov	r8,r12,ror#32-9		@ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
1455	mov	r9,r14,ror#32-9
1456
1457	bic	r10,r5,r2,ror#13-3
1458	bic	r11,r4,r3,ror#12-3
1459	bic	r12,r6,r5,ror#32-13
1460	bic	r14,r7,r4,ror#32-12
1461	eor	r10,r0,r10,ror#32-13
1462	eor	r11,r1,r11,ror#32-12
1463#ifndef	__thumb2__
1464	str	r10,[sp,#80]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
1465#endif
1466	eor	r12,r12,r2,ror#32-3
1467#ifndef	__thumb2__
1468	str	r11,[sp,#80+4]
1469#else
1470	strd	r10,r11,[sp,#80]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
1471#endif
1472	eor	r14,r14,r3,ror#32-3
1473#ifndef	__thumb2__
1474	str	r12,[sp,#88]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
1475#endif
1476	bic	r10,r8,r6
1477	bic	r11,r9,r7
1478#ifndef	__thumb2__
1479	str	r14,[sp,#88+4]
1480#else
1481	strd	r12,r14,[sp,#88]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
1482#endif
1483	eor	r10,r10,r5,ror#32-13
1484	eor	r11,r11,r4,ror#32-12
1485#ifndef	__thumb2__
1486	str	r10,[sp,#96]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
1487#endif
1488	bic	r12,r0,r8
1489#ifndef	__thumb2__
1490	str	r11,[sp,#96+4]
1491#else
1492	strd	r10,r11,[sp,#96]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
1493#endif
1494	bic	r14,r1,r9
1495	eor	r12,r12,r6
1496	eor	r14,r14,r7
1497#ifndef	__thumb2__
1498	str	r12,[sp,#104]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
1499#endif
1500	bic	r10,r2,r0,ror#3
1501#ifndef	__thumb2__
1502	str	r14,[sp,#104+4]
1503#else
1504	strd	r12,r14,[sp,#104]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
1505#endif
1506	bic	r11,r3,r1,ror#3
1507#ifndef	__thumb2__
1508	ldr	r1,[sp,#272]		@ A[0][4] [in reverse order]
1509#endif
1510	eor	r10,r8,r10,ror#32-3
1511#ifndef	__thumb2__
1512	ldr	r0,[sp,#272+4]
1513#else
1514	ldrd	r1,r0,[sp,#272]		@ A[0][4] [in reverse order]
1515#endif
1516	eor	r11,r9,r11,ror#32-3
1517#ifndef	__thumb2__
1518	str	r10,[sp,#112]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
1519#endif
1520	 add	r9,sp,#208
1521#ifndef	__thumb2__
1522	str	r11,[sp,#112+4]
1523#else
1524	strd	r10,r11,[sp,#112]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
1525#endif
1526
1527#ifndef	__thumb2__
1528	ldr	r10,[sp,#232]		@ D[4]
1529#endif
1530#ifndef	__thumb2__
1531	ldr	r11,[sp,#232+4]
1532#else
1533	ldrd	r10,r11,[sp,#232]		@ D[4]
1534#endif
1535#ifndef	__thumb2__
1536	ldr	r12,[sp,#200]		@ D[0]
1537#endif
1538#ifndef	__thumb2__
1539	ldr	r14,[sp,#200+4]
1540#else
1541	ldrd	r12,r14,[sp,#200]		@ D[0]
1542#endif
1543
1544	ldmia	r9,{r6-r9}		@ D[1..2]
1545
1546	eor	r1,r1,r10
1547#ifndef	__thumb2__
1548	ldr	r2,[sp,#280]		@ A[1][0]
1549#endif
1550	eor	r0,r0,r11
1551#ifndef	__thumb2__
1552	ldr	r3,[sp,#280+4]
1553#else
1554	ldrd	r2,r3,[sp,#280]		@ A[1][0]
1555#endif
1556	@ mov	r1,r10,ror#32-13		@ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
1557#ifndef	__thumb2__
1558	ldr	r4,[sp,#328]		@ A[2][1]
1559#endif
1560	@ mov	r0,r11,ror#32-14		@ [was loaded in reverse order]
1561#ifndef	__thumb2__
1562	ldr	r5,[sp,#328+4]
1563#else
1564	ldrd	r4,r5,[sp,#328]		@ A[2][1]
1565#endif
1566
1567	eor	r2,r2,r12
1568#ifndef	__thumb2__
1569	ldr	r10,[sp,#376]		@ A[3][2]
1570#endif
1571	eor	r3,r3,r14
1572#ifndef	__thumb2__
1573	ldr	r11,[sp,#376+4]
1574#else
1575	ldrd	r10,r11,[sp,#376]		@ A[3][2]
1576#endif
1577	@ mov	r2,r2,ror#32-18		@ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
1578#ifndef	__thumb2__
1579	ldr	r12,[sp,#224]		@ D[3]
1580#endif
1581	@ mov	r3,r3,ror#32-18
1582#ifndef	__thumb2__
1583	ldr	r14,[sp,#224+4]
1584#else
1585	ldrd	r12,r14,[sp,#224]		@ D[3]
1586#endif
1587
1588	eor	r6,r6,r4
1589	eor	r7,r7,r5
1590	mov	r4,r6,ror#32-5		@ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
1591	mov	r5,r7,ror#32-5
1592
1593	eor	r10,r10,r8
1594#ifndef	__thumb2__
1595	ldr	r8,[sp,#424]		@ A[4][3]
1596#endif
1597	eor	r11,r11,r9
1598#ifndef	__thumb2__
1599	ldr	r9,[sp,#424+4]
1600#else
1601	ldrd	r8,r9,[sp,#424]		@ A[4][3]
1602#endif
1603	mov	r7,r10,ror#32-7		@ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
1604	mov	r6,r11,ror#32-8
1605
1606	eor	r12,r12,r8
1607	eor	r14,r14,r9
1608	mov	r8,r12,ror#32-28		@ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
1609	mov	r9,r14,ror#32-28
1610
1611	bic	r10,r4,r2,ror#32-18
1612	bic	r11,r5,r3,ror#32-18
1613	eor	r10,r10,r0,ror#32-14
1614	eor	r11,r11,r1,ror#32-13
1615#ifndef	__thumb2__
1616	str	r10,[sp,#120]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
1617#endif
1618	bic	r12,r6,r4
1619#ifndef	__thumb2__
1620	str	r11,[sp,#120+4]
1621#else
1622	strd	r10,r11,[sp,#120]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
1623#endif
1624	bic	r14,r7,r5
1625	eor	r12,r12,r2,ror#32-18
1626#ifndef	__thumb2__
1627	str	r12,[sp,#128]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
1628#endif
1629	eor	r14,r14,r3,ror#32-18
1630#ifndef	__thumb2__
1631	str	r14,[sp,#128+4]
1632#else
1633	strd	r12,r14,[sp,#128]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
1634#endif
1635	bic	r10,r8,r6
1636	bic	r11,r9,r7
1637	bic	r12,r0,r8,ror#14
1638	bic	r14,r1,r9,ror#13
1639	eor	r10,r10,r4
1640	eor	r11,r11,r5
1641#ifndef	__thumb2__
1642	str	r10,[sp,#136]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
1643#endif
1644	bic	r2,r2,r0,ror#18-14
1645#ifndef	__thumb2__
1646	str	r11,[sp,#136+4]
1647#else
1648	strd	r10,r11,[sp,#136]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
1649#endif
1650	eor	r12,r6,r12,ror#32-14
1651	bic	r11,r3,r1,ror#18-13
1652	eor	r14,r7,r14,ror#32-13
1653#ifndef	__thumb2__
1654	str	r12,[sp,#144]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
1655#endif
1656#ifndef	__thumb2__
1657	str	r14,[sp,#144+4]
1658#else
1659	strd	r12,r14,[sp,#144]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
1660#endif
1661	 add	r14,sp,#216
1662#ifndef	__thumb2__
1663	ldr	r0,[sp,#256]		@ A[0][2]
1664#endif
1665	eor	r10,r8,r2,ror#32-18
1666#ifndef	__thumb2__
1667	ldr	r1,[sp,#256+4]
1668#else
1669	ldrd	r0,r1,[sp,#256]		@ A[0][2]
1670#endif
1671	eor	r11,r9,r11,ror#32-18
1672#ifndef	__thumb2__
1673	str	r10,[sp,#152]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
1674#endif
1675#ifndef	__thumb2__
1676	str	r11,[sp,#152+4]
1677#else
1678	strd	r10,r11,[sp,#152]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
1679#endif
1680
1681	ldmia	r14,{r10-r12,r14}	@ D[2..3]
1682#ifndef	__thumb2__
1683	ldr	r2,[sp,#304]		@ A[1][3]
1684#endif
1685#ifndef	__thumb2__
1686	ldr	r3,[sp,#304+4]
1687#else
1688	ldrd	r2,r3,[sp,#304]		@ A[1][3]
1689#endif
1690#ifndef	__thumb2__
1691	ldr	r6,[sp,#232]		@ D[4]
1692#endif
1693#ifndef	__thumb2__
1694	ldr	r7,[sp,#232+4]
1695#else
1696	ldrd	r6,r7,[sp,#232]		@ D[4]
1697#endif
1698
1699	eor	r0,r0,r10
1700#ifndef	__thumb2__
1701	ldr	r4,[sp,#352]		@ A[2][4]
1702#endif
1703	eor	r1,r1,r11
1704#ifndef	__thumb2__
1705	ldr	r5,[sp,#352+4]
1706#else
1707	ldrd	r4,r5,[sp,#352]		@ A[2][4]
1708#endif
1709	@ mov	r0,r0,ror#32-31		@ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
1710#ifndef	__thumb2__
1711	ldr	r8,[sp,#200]		@ D[0]
1712#endif
1713	@ mov	r1,r1,ror#32-31
1714#ifndef	__thumb2__
1715	ldr	r9,[sp,#200+4]
1716#else
1717	ldrd	r8,r9,[sp,#200]		@ D[0]
1718#endif
1719
1720	eor	r12,r12,r2
1721#ifndef	__thumb2__
1722	ldr	r10,[sp,#360]		@ A[3][0]
1723#endif
1724	eor	r14,r14,r3
1725#ifndef	__thumb2__
1726	ldr	r11,[sp,#360+4]
1727#else
1728	ldrd	r10,r11,[sp,#360]		@ A[3][0]
1729#endif
1730	mov	r3,r12,ror#32-27		@ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
1731#ifndef	__thumb2__
1732	ldr	r12,[sp,#208]		@ D[1]
1733#endif
1734	mov	r2,r14,ror#32-28
1735#ifndef	__thumb2__
1736	ldr	r14,[sp,#208+4]
1737#else
1738	ldrd	r12,r14,[sp,#208]		@ D[1]
1739#endif
1740
1741	eor	r6,r6,r4
1742	eor	r7,r7,r5
1743	mov	r5,r6,ror#32-19		@ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
1744	mov	r4,r7,ror#32-20
1745
1746	eor	r10,r10,r8
1747#ifndef	__thumb2__
1748	ldr	r8,[sp,#408]		@ A[4][1]
1749#endif
1750	eor	r11,r11,r9
1751#ifndef	__thumb2__
1752	ldr	r9,[sp,#408+4]
1753#else
1754	ldrd	r8,r9,[sp,#408]		@ A[4][1]
1755#endif
1756	mov	r7,r10,ror#32-20		@ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
1757	mov	r6,r11,ror#32-21
1758
1759	eor	r8,r8,r12
1760	eor	r9,r9,r14
1761	@ mov	r8,r2,ror#32-1		@ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
1762	@ mov	r9,r3,ror#32-1
1763
1764	bic	r10,r4,r2
1765	bic	r11,r5,r3
1766	eor	r10,r10,r0,ror#32-31
1767#ifndef	__thumb2__
1768	str	r10,[sp,#160]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
1769#endif
1770	eor	r11,r11,r1,ror#32-31
1771#ifndef	__thumb2__
1772	str	r11,[sp,#160+4]
1773#else
1774	strd	r10,r11,[sp,#160]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
1775#endif
1776	bic	r12,r6,r4
1777	bic	r14,r7,r5
1778	eor	r12,r12,r2
1779	eor	r14,r14,r3
1780#ifndef	__thumb2__
1781	str	r12,[sp,#168]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
1782#endif
1783	bic	r10,r8,r6,ror#1
1784#ifndef	__thumb2__
1785	str	r14,[sp,#168+4]
1786#else
1787	strd	r12,r14,[sp,#168]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
1788#endif
1789	bic	r11,r9,r7,ror#1
1790	bic	r12,r0,r8,ror#31-1
1791	bic	r14,r1,r9,ror#31-1
1792	eor	r4,r4,r10,ror#32-1
1793#ifndef	__thumb2__
1794	str	r4,[sp,#176]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
1795#endif
1796	eor	r5,r5,r11,ror#32-1
1797#ifndef	__thumb2__
1798	str	r5,[sp,#176+4]
1799#else
1800	strd	r4,r5,[sp,#176]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
1801#endif
1802	eor	r6,r6,r12,ror#32-31
1803	eor	r7,r7,r14,ror#32-31
1804#ifndef	__thumb2__
1805	str	r6,[sp,#184]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
1806#endif
1807	bic	r10,r2,r0,ror#32-31
1808#ifndef	__thumb2__
1809	str	r7,[sp,#184+4]
1810#else
1811	strd	r6,r7,[sp,#184]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
1812#endif
1813	bic	r11,r3,r1,ror#32-31
1814	 add	r12,sp,#0
1815	eor	r8,r10,r8,ror#32-1
1816	 add	r10,sp,#40
1817	eor	r9,r11,r9,ror#32-1
1818#ifndef	__thumb2__
1819	str	r8,[sp,#192]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
1820#endif
1821#ifndef	__thumb2__
1822	str	r9,[sp,#192+4]
1823#else
1824	strd	r8,r9,[sp,#192]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
1825#endif
1826	blo	.Lround2x
1827
1828	ldr	pc,[sp,#440]
1829.size	KeccakF1600_int,.-KeccakF1600_int
1830
@ KeccakF1600: apply the permutation to a state held in caller memory.
@
@ In:	r0 = pointer to the state; 200 bytes are read and later rewritten.
@ The work is done on a stack copy at sp+0..199 through KeccakF1600_enter.
@ r4-r11 are saved and reloaded here; r0-r3 and r12 are not preserved.
1831.type	KeccakF1600, %function
1832.align	5
1833KeccakF1600:
1834	stmdb	sp!,{r0,r4-r11,lr}		@ r0 takes the lowest slot: sp+456 once the frame below exists
1835	sub	sp,sp,#440+16			@ space for A[5][5],D[5],T[5][5],...
1836
1837	add	r10,r0,#40			@ r10 walks the caller state from byte 40
1838	add	r11,sp,#40			@ r11 walks the stack copy from sp+40
1839	ldmia	r0,    {r0-r9}		@ copy A[5][5] to stack
1840	stmia	sp,    {r0-r9}
1841	ldmia	r10!,{r0-r9}
1842	stmia	r11!,{r0-r9}
1843	ldmia	r10!,{r0-r9}
1844	stmia	r11!,{r0-r9}
1845	ldmia	r10!,{r0-r9}
1846	stmia	r11!,{r0-r9}
1847	ldmia	r10, {r0-r9}		@ last chunk, bytes 160..199; r4-r9 keep bytes 176..199
1848	add	r12,sp,#0		@ r12 = sp and r10 = sp+40: the same pointers that
1849	add	r10,sp,#40		@ KeccakF1600_int prepares ahead of KeccakF1600_enter
1850	stmia	r11, {r0-r9}
1851
1852	bl	KeccakF1600_enter		@ permute the stack copy in place
1853
1854	ldr	r11, [sp,#440+16]		@ restore pointer to A (the r0 pushed by stmdb)
1855	ldmia	sp,    {r0-r9}
1856	stmia	r11!,{r0-r9}		@ return A[5][5]
1857	ldmia	r10!,{r0-r9}		@ r10 is sp+40 here: the round code resets it before returning
1858	stmia	r11!,{r0-r9}
1859	ldmia	r10!,{r0-r9}
1860	stmia	r11!,{r0-r9}
1861	ldmia	r10!,{r0-r9}
1862	stmia	r11!,{r0-r9}
1863	ldmia	r10, {r0-r9}
1864	stmia	r11, {r0-r9}
1865
1866	add	sp,sp,#440+20			@ release the work area plus the saved r0 slot
1867	ldmia	sp!,{r4-r11,pc}		@ reload r4-r11 and return through the saved lr
1868.size	KeccakF1600,.-KeccakF1600
@ SHA3_absorb: XOR whole input blocks into the state, permuting after each.
@
@ In:	r0 = pointer to the 200-byte state A[5][5]
@	r1 = input pointer
@	r2 = len, input length in bytes
@	r3 = bsz, block size in bytes
@ Out:	r0 = number of input bytes left over (always less than bsz)
@
@ If len is below bsz on entry the state is not touched at all.
@ Input is consumed 8 bytes at a time.  Each 64-bit lane is split into its
@ even-numbered and odd-numbered bits before being XORed into the state:
@ the first word of a lane receives the even bits (from the low input word
@ in its low half, from the high input word in its high half), the second
@ word receives the odd bits in the same arrangement.
@ NOTE(review): bsz is assumed to be a multiple of 8 -- confirm with callers.
@
@ Frame after the prologue (offsets from sp):
@	  0..199  working copy of the state
@	456..468  masks 0x55555555, 0x33333333, 0x0f0f0f0f, 0x00ff00ff
@	472       saved r0 (state pointer)
@	476       saved r1, rewritten with the advancing input pointer
@	480       saved r2, rewritten with len - bsz
@	484       saved r3 (bsz)
@	488..     saved r4-r12 and lr
1869.global	SHA3_absorb
1870.type	SHA3_absorb,%function
1871.align	5
1872SHA3_absorb:
1873	stmdb	sp!,{r0-r12,lr}
1874	sub	sp,sp,#456+16
1875
1876	add	r10,r0,#40		@ r10 = read cursor for state bytes 40..199
1877	@ mov	r11,r1			-- not needed, r1 is reloaded from its stack slot below
1878	mov	r12,r2			@ r12 = len
1879	mov	r14,r3			@ r14 = bsz
1880	cmp	r2,r3
1881	blo	.Labsorb_abort		@ less than one block: return len unchanged
1882
1883	add	r11,sp,#0		@ r11 doubles as the stack write cursor for the copy
1884	ldmia	r0,      {r0-r9}	@ copy A[5][5] to stack
1885	stmia	r11!,   {r0-r9}
1886	ldmia	r10!,{r0-r9}
1887	stmia	r11!,   {r0-r9}
1888	ldmia	r10!,{r0-r9}
1889	stmia	r11!,   {r0-r9}
1890	ldmia	r10!,{r0-r9}
1891	stmia	r11!,   {r0-r9}
1892	ldmia	r10!,{r0-r9}
1893	stmia	r11,    {r0-r9}
1894
1895	ldr	r11,[sp,#476]		@ r11 = input pointer again (the saved r1)
1896#ifdef	__thumb2__
1897	mov	r9,#0x00ff00ff
1898	mov	r8,#0x0f0f0f0f
1899	mov	r7,#0x33333333
1900	mov	r6,#0x55555555
1901#else
1902	mov	r6,#0x11		@ compose constants
1903	mov	r8,#0x0f
1904	mov	r9,#0xff
1905	orr	r6,r6,r6,lsl#8
1906	orr	r8,r8,r8,lsl#8
1907	orr	r6,r6,r6,lsl#16		@ 0x11111111
1908	orr	r9,r9,r9,lsl#16		@ 0x00ff00ff
1909	orr	r8,r8,r8,lsl#16		@ 0x0f0f0f0f
1910	orr	r7,r6,r6,lsl#1		@ 0x33333333
1911	orr	r6,r6,r6,lsl#2		@ 0x55555555
1912#endif
1913	str	r9,[sp,#468]		@ park the masks so they can be reloaded
1914	str	r8,[sp,#464]		@ after every call to KeccakF1600_int
1915	str	r7,[sp,#460]
1916	str	r6,[sp,#456]
1917	b	.Loop_absorb
1918
1919.align	4
1920.Loop_absorb:
1921	subs	r0,r12,r14		@ r0 = len - bsz; a borrow means no whole block is left
1922	blo	.Labsorbed
1923	add	r10,sp,#0		@ r10 = lane cursor into the stack copy of the state
1924	str	r0,[sp,#480]		@ save len - bsz
1925
1926.align	4
1927.Loop_block:
	@ Assemble 8 input bytes into r0 (lo) and r1 (hi), little-endian.
1928	ldrb	r0,[r11],#1
1929	ldrb	r1,[r11],#1
1930	ldrb	r2,[r11],#1
1931	ldrb	r3,[r11],#1
1932	ldrb	r4,[r11],#1
1933	orr	r0,r0,r1,lsl#8
1934	ldrb	r1,[r11],#1
1935	orr	r0,r0,r2,lsl#16
1936	ldrb	r2,[r11],#1
1937	orr	r0,r0,r3,lsl#24		@ lo
1938	ldrb	r3,[r11],#1
1939	orr	r1,r4,r1,lsl#8
1940	orr	r1,r1,r2,lsl#16
1941	orr	r1,r1,r3,lsl#24		@ hi
1942
	@ r2/r3 take the even bits of lo/hi and squeeze them toward bit 0;
	@ r0/r1 keep the odd bits of lo/hi and squeeze them toward bit 31.
1943	and	r2,r0,r6		@ &=0x55555555
1944	and	r0,r0,r6,lsl#1		@ &=0xaaaaaaaa
1945	and	r3,r1,r6		@ &=0x55555555
1946	and	r1,r1,r6,lsl#1		@ &=0xaaaaaaaa
1947	orr	r2,r2,r2,lsr#1
1948	orr	r0,r0,r0,lsl#1
1949	orr	r3,r3,r3,lsr#1
1950	orr	r1,r1,r1,lsl#1
1951	and	r2,r2,r7		@ &=0x33333333
1952	and	r0,r0,r7,lsl#2		@ &=0xcccccccc
1953	and	r3,r3,r7		@ &=0x33333333
1954	and	r1,r1,r7,lsl#2		@ &=0xcccccccc
1955	orr	r2,r2,r2,lsr#2
1956	orr	r0,r0,r0,lsl#2
1957	orr	r3,r3,r3,lsr#2
1958	orr	r1,r1,r1,lsl#2
1959	and	r2,r2,r8		@ &=0x0f0f0f0f
1960	and	r0,r0,r8,lsl#4		@ &=0xf0f0f0f0
1961	and	r3,r3,r8		@ &=0x0f0f0f0f
1962	and	r1,r1,r8,lsl#4		@ &=0xf0f0f0f0
1963	ldmia	r10,{r4-r5}		@ A_flat[i]
1964	orr	r2,r2,r2,lsr#4
1965	orr	r0,r0,r0,lsl#4
1966	orr	r3,r3,r3,lsr#4
1967	orr	r1,r1,r1,lsl#4
1968	and	r2,r2,r9		@ &=0x00ff00ff
1969	and	r0,r0,r9,lsl#8		@ &=0xff00ff00
1970	and	r3,r3,r9		@ &=0x00ff00ff
1971	and	r1,r1,r9,lsl#8		@ &=0xff00ff00
1972	orr	r2,r2,r2,lsr#8
1973	orr	r0,r0,r0,lsl#8
1974	orr	r3,r3,r3,lsr#8
1975	orr	r1,r1,r1,lsl#8
1976
	@ Low 16 bits of r2/r3 and high 16 bits of r0/r1 now hold the result.
1977	mov	r2,r2,lsl#16
1978	mov	r1,r1,lsr#16
1979	eor	r4,r4,r3,lsl#16		@ even bits of hi -> upper half of first word
1980	eor	r5,r5,r0,lsr#16		@ odd bits of lo  -> lower half of second word
1981	eor	r4,r4,r2,lsr#16		@ even bits of lo -> lower half of first word
1982	eor	r5,r5,r1,lsl#16		@ odd bits of hi  -> upper half of second word
1983	stmia	r10!,{r4-r5}	@ A_flat[i++] ^= BitInterleave(inp[0..7])
1984
1985	subs	r14,r14,#8		@ one lane done; loop while block bytes remain
1986	bhi	.Loop_block
1987
1988	str	r11,[sp,#476]		@ park the input pointer: KeccakF1600_enter zeroes r11
1989
1990	bl	KeccakF1600_int		@ permute the stack copy; it addresses everything via sp
1991
1992	add	r14,sp,#456
1993	ldmia	r14,{r6-r12,r14}	@ restore constants and variables
					@ r6-r9 masks, r10 state pointer, r11 input,
					@ r12 = len - bsz (the new len), r14 = bsz
1994	b	.Loop_absorb
1995
1996.align	4
1997.Labsorbed:
	@ Only reached after at least one block, so r10 holds the state
	@ pointer reloaded from sp+472 by the ldmia above.
1998	add	r11,sp,#40
1999	ldmia	sp,      {r0-r9}
2000	stmia	r10!,{r0-r9}	@ return A[5][5]
2001	ldmia	r11!,   {r0-r9}
2002	stmia	r10!,{r0-r9}
2003	ldmia	r11!,   {r0-r9}
2004	stmia	r10!,{r0-r9}
2005	ldmia	r11!,   {r0-r9}
2006	stmia	r10!,{r0-r9}
2007	ldmia	r11,    {r0-r9}
2008	stmia	r10, {r0-r9}
2009
2010.Labsorb_abort:
2011	add	sp,sp,#456+32		@ drop the frame and the saved r0-r3
2012	mov	r0,r12			@ return value
2013	ldmia	sp!,{r4-r12,pc}
2014.size	SHA3_absorb,.-SHA3_absorb
@ SHA3_squeeze: copy len bytes of output out of the state, running the
@ permutation again each time a whole block of bsz bytes has been emitted.
@
@ In:	r0 = pointer to the state (A_flat), lanes stored bit-interleaved
@	r1 = output pointer
@	r2 = len, number of bytes to produce
@	r3 = bsz, block size in bytes
@ Out:	no return value is set up
@
@ Working registers:
@	r4  = output cursor		r5  = bytes still to produce
@	r10 = state read cursor		r12 = bytes left in the current block
@	r14 = state base, handed to KeccakF1600 in r0
@	r6-r9 = masks 0x55555555, 0x33333333, 0x0f0f0f0f, 0x00ff00ff
@
@ Stack after the prologue, lowest address first: masks r6-r9, saved r0,
@ saved r3, saved r4-r10, lr.  The saved r0 and r3 are reloaded into r10
@ and r12 after every permutation; the epilogue skips them together with
@ the masks (24 bytes).
@
@ A request for zero bytes returns without writing to the output buffer.
.global	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
	stmdb	sp!,{r0,r3-r10,lr}

	mov	r10,r0
	mov	r4,r1
	mov	r5,r2
	mov	r12,r3

#ifdef	__thumb2__
	mov	r9,#0x00ff00ff
	mov	r8,#0x0f0f0f0f
	mov	r7,#0x33333333
	mov	r6,#0x55555555
#else
	mov	r6,#0x11		@ compose constants
	mov	r8,#0x0f
	mov	r9,#0xff
	orr	r6,r6,r6,lsl#8
	orr	r8,r8,r8,lsl#8
	orr	r6,r6,r6,lsl#16		@ 0x11111111
	orr	r9,r9,r9,lsl#16		@ 0x00ff00ff
	orr	r8,r8,r8,lsl#16		@ 0x0f0f0f0f
	orr	r7,r6,r6,lsl#1		@ 0x33333333
	orr	r6,r6,r6,lsl#2		@ 0x55555555
#endif
	stmdb	sp!,{r6-r9}

	@ The loop below always converts a lane first and the tail path
	@ stores a byte before it tests the count, so len == 0 has to be
	@ filtered out here or up to 7 bytes would be written.
	cmp	r5,#0
	beq	.Lsqueeze_done

	mov	r14,r10
	b	.Loop_squeeze

.align	4
.Loop_squeeze:
	ldmia	r10!,{r0,r1}	@ A_flat[i++]

	@ r0 holds the even bits, r1 the odd bits; in each, the lower half
	@ belongs to the low output word and the upper half to the high one.
	mov	r2,r0,lsl#16
	mov	r3,r1,lsl#16		@ r3 = r1 << 16
	mov	r2,r2,lsr#16		@ r2 = r0 & 0x0000ffff
	mov	r1,r1,lsr#16
	mov	r0,r0,lsr#16		@ r0 = r0 >> 16
	mov	r1,r1,lsl#16		@ r1 = r1 & 0xffff0000

	@ Spread r2/r0 upward onto even bit positions and r3/r1 downward
	@ onto odd bit positions (the inverse of the absorb-side packing).
	orr	r2,r2,r2,lsl#8
	orr	r3,r3,r3,lsr#8
	orr	r0,r0,r0,lsl#8
	orr	r1,r1,r1,lsr#8
	and	r2,r2,r9		@ &=0x00ff00ff
	and	r3,r3,r9,lsl#8		@ &=0xff00ff00
	and	r0,r0,r9		@ &=0x00ff00ff
	and	r1,r1,r9,lsl#8		@ &=0xff00ff00
	orr	r2,r2,r2,lsl#4
	orr	r3,r3,r3,lsr#4
	orr	r0,r0,r0,lsl#4
	orr	r1,r1,r1,lsr#4
	and	r2,r2,r8		@ &=0x0f0f0f0f
	and	r3,r3,r8,lsl#4		@ &=0xf0f0f0f0
	and	r0,r0,r8		@ &=0x0f0f0f0f
	and	r1,r1,r8,lsl#4		@ &=0xf0f0f0f0
	orr	r2,r2,r2,lsl#2
	orr	r3,r3,r3,lsr#2
	orr	r0,r0,r0,lsl#2
	orr	r1,r1,r1,lsr#2
	and	r2,r2,r7		@ &=0x33333333
	and	r3,r3,r7,lsl#2		@ &=0xcccccccc
	and	r0,r0,r7		@ &=0x33333333
	and	r1,r1,r7,lsl#2		@ &=0xcccccccc
	orr	r2,r2,r2,lsl#1
	orr	r3,r3,r3,lsr#1
	orr	r0,r0,r0,lsl#1
	orr	r1,r1,r1,lsr#1
	and	r2,r2,r6		@ &=0x55555555
	and	r3,r3,r6,lsl#1		@ &=0xaaaaaaaa
	and	r0,r0,r6		@ &=0x55555555
	and	r1,r1,r6,lsl#1		@ &=0xaaaaaaaa

	orr	r2,r2,r3		@ r2 = low word of the lane
	orr	r0,r0,r1		@ r0 = high word of the lane

	cmp	r5,#8
	blo	.Lsqueeze_tail		@ fewer than 8 bytes wanted: partial lane
	mov	r1,r2,lsr#8		@ full lane, stored byte by byte, low byte first
	strb	r2,[r4],#1
	mov	r3,r2,lsr#16
	strb	r1,[r4],#1
	mov	r2,r2,lsr#24
	strb	r3,[r4],#1
	strb	r2,[r4],#1

	mov	r1,r0,lsr#8
	strb	r0,[r4],#1
	mov	r3,r0,lsr#16
	strb	r1,[r4],#1
	mov	r0,r0,lsr#24
	strb	r3,[r4],#1
	strb	r0,[r4],#1
	subs	r5,r5,#8
	beq	.Lsqueeze_done

	subs	r12,r12,#8		@ bsz -= 8
	bhi	.Loop_squeeze

	mov	r0,r14			@ original r10

	bl	KeccakF1600		@ block exhausted: permute (r4 and r5 survive the call)

	ldmia	sp,{r6-r10,r12}		@ restore constants and variables
	mov	r14,r10			@ bl overwrote r14; keep the state base there again
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	@ 1..7 bytes left: low word first, then the high word.
	strb	r2,[r4],#1
	mov	r2,r2,lsr#8
	subs	r5,r5,#1
	beq	.Lsqueeze_done
	strb	r2,[r4],#1
	mov	r2,r2,lsr#8
	subs	r5,r5,#1
	beq	.Lsqueeze_done
	strb	r2,[r4],#1
	mov	r2,r2,lsr#8
	subs	r5,r5,#1
	beq	.Lsqueeze_done
	strb	r2,[r4],#1
	subs	r5,r5,#1
	beq	.Lsqueeze_done

	strb	r0,[r4],#1
	mov	r0,r0,lsr#8
	subs	r5,r5,#1
	beq	.Lsqueeze_done
	strb	r0,[r4],#1
	mov	r0,r0,lsr#8
	subs	r5,r5,#1
	beq	.Lsqueeze_done
	strb	r0,[r4]			@ seventh byte is the last one possible here
	b	.Lsqueeze_done

.align	4
.Lsqueeze_done:
	add	sp,sp,#24		@ skip the four masks and the saved r0, r3
	ldmia	sp!,{r4-r10,pc}
.size	SHA3_squeeze,.-SHA3_squeeze
2160#if __ARM_MAX_ARCH__>=7
2161.fpu	neon
2162
@ iotas64: table of 24 64-bit constants, one per round.
@ KeccakF1600_neon points r2 at this table, loads one entry per loop
@ iteration with post-increment and XORs it into A[0][0] (the Iota step).
2163.type	iotas64, %object
2164.align 5
2165iotas64:
2166	.quad	0x0000000000000001
2167	.quad	0x0000000000008082
2168	.quad	0x800000000000808a
2169	.quad	0x8000000080008000
2170	.quad	0x000000000000808b
2171	.quad	0x0000000080000001
2172	.quad	0x8000000080008081
2173	.quad	0x8000000000008009
2174	.quad	0x000000000000008a
2175	.quad	0x0000000000000088
2176	.quad	0x0000000080008009
2177	.quad	0x000000008000000a
2178	.quad	0x000000008000808b
2179	.quad	0x800000000000008b
2180	.quad	0x8000000000008089
2181	.quad	0x8000000000008003
2182	.quad	0x8000000000008002
2183	.quad	0x8000000000000080
2184	.quad	0x000000000000800a
2185	.quad	0x800000008000000a
2186	.quad	0x8000000080008081
2187	.quad	0x8000000000008080
2188	.quad	0x0000000080000001
2189	.quad	0x8000000080008008
2190.size	iotas64,.-iotas64
2191
@ KeccakF1600_neon: 24 rounds of the permutation on a state that is kept
@ entirely in NEON registers.
@
@ State layout (the packing used by SHA3_absorb_neon):
@	q0-q4   = A[0][0..4] in the low half, A[1][0..4] in the high half
@	q5-q9   = A[2][0..4] in the low half, A[3][0..4] in the high half
@	d20-d24 = A[4][0..4]
@ In:	r0 = 8-byte aligned pointer to 24 bytes of scratch memory, used to
@	     spill and reload registers inside each round
@ Scratch: r1 (= r0+16), r2 (round-constant cursor), r3 (round counter),
@	d25-d31 and the flags.
@
@ Return: a plain bx lr.  When the file is assembled as Thumb-2 the
@ instruction is emitted as such; the ARM build keeps the raw opcode word
@ so that the file still assembles with -march=armv4.  The raw word must
@ not be used in the Thumb-2 build because it is an ARM-state encoding and
@ would be decoded as Thumb code there.
.type	KeccakF1600_neon, %function
.align	5
KeccakF1600_neon:
	add	r1, r0, #16
	adr	r2, iotas64
	mov	r3, #24			@ loop counter
	b	.Loop_neon

.align	4
.Loop_neon:
	@ Theta
	vst1.64		{q4},  [r0,:64]		@ offload A[0..1][4]
	veor		q13, q0,  q5		@ A[0..1][0]^A[2..3][0]
	vst1.64		{d18}, [r1,:64]		@ offload A[2][4]
	veor		q14, q1,  q6		@ A[0..1][1]^A[2..3][1]
	veor		q15, q2,  q7		@ A[0..1][2]^A[2..3][2]
	veor		d26, d26, d27		@ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0]
	veor		d27, d28, d29		@ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1]
	veor		q14, q3,  q8		@ A[0..1][3]^A[2..3][3]
	veor		q4,  q4,  q9		@ A[0..1][4]^A[2..3][4]
	veor		d30, d30, d31		@ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2]
	veor		d31, d28, d29		@ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3]
	veor		d25, d8,  d9		@ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4]
	veor		q13, q13, q10		@ C[0..1]^=A[4][0..1]
	veor		q14, q15, q11		@ C[2..3]^=A[4][2..3]
	veor		d25, d25, d24		@ C[4]^=A[4][4]

	vadd.u64	q4,  q13, q13		@ C[0..1]<<1
	vadd.u64	q15, q14, q14		@ C[2..3]<<1
	vadd.u64	d18, d25, d25		@ C[4]<<1
	vsri.u64	q4,  q13, #63		@ ROL64(C[0..1],1)
	vsri.u64	q15, q14, #63		@ ROL64(C[2..3],1)
	vsri.u64	d18, d25, #63		@ ROL64(C[4],1)
	veor		d25, d25, d9		@ D[0] = C[4] ^= ROL64(C[1],1)
	veor		q13, q13, q15		@ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
	veor		d28, d28, d18		@ D[3] = C[2] ^= ROL64(C[4],1)
	veor		d29, d29, d8		@ D[4] = C[3] ^= ROL64(C[0],1)

	veor		d0,  d0,  d25		@ A[0][0] ^= C[4]
	veor		d1,  d1,  d25		@ A[1][0] ^= C[4]
	veor		d10, d10, d25		@ A[2][0] ^= C[4]
	veor		d11, d11, d25		@ A[3][0] ^= C[4]
	veor		d20, d20, d25		@ A[4][0] ^= C[4]

	veor		d2,  d2,  d26		@ A[0][1] ^= D[1]
	veor		d3,  d3,  d26		@ A[1][1] ^= D[1]
	veor		d12, d12, d26		@ A[2][1] ^= D[1]
	veor		d13, d13, d26		@ A[3][1] ^= D[1]
	veor		d21, d21, d26		@ A[4][1] ^= D[1]
	vmov		d26, d27		@ duplicate D[2] so q13 applies it to a row pair

	veor		d6,  d6,  d28		@ A[0][3] ^= C[2]
	veor		d7,  d7,  d28		@ A[1][3] ^= C[2]
	veor		d16, d16, d28		@ A[2][3] ^= C[2]
	veor		d17, d17, d28		@ A[3][3] ^= C[2]
	veor		d23, d23, d28		@ A[4][3] ^= C[2]
	vld1.64		{q4},  [r0,:64]		@ restore A[0..1][4]
	vmov		d28, d29		@ duplicate D[4] so q14 applies it to a row pair

	vld1.64		{d18}, [r1,:64]		@ restore A[2][4]
	veor		q2,  q2,  q13		@ A[0..1][2] ^= D[2]
	veor		q7,  q7,  q13		@ A[2..3][2] ^= D[2]
	veor		d22, d22, d27		@ A[4][2]    ^= D[2]

	veor		q4,  q4,  q14		@ A[0..1][4] ^= C[3]
	veor		q9,  q9,  q14		@ A[2..3][4] ^= C[3]
	veor		d24, d24, d29		@ A[4][4]    ^= C[3]

	@ Rho + Pi
	@ Each rotate is a vshl into the destination followed by a vsri
	@ that inserts the wrapped-around bits; byte-multiple rotates use
	@ vext.8 instead.
	vmov		d26, d2			@ C[1] = A[0][1]
	vshl.u64	d2,  d3,  #44
	vmov		d27, d4			@ C[2] = A[0][2]
	vshl.u64	d4,  d14, #43
	vmov		d28, d6			@ C[3] = A[0][3]
	vshl.u64	d6,  d17, #21
	vmov		d29, d8			@ C[4] = A[0][4]
	vshl.u64	d8,  d24, #14
	vsri.u64	d2,  d3,  #64-44	@ A[0][1] = ROL64(A[1][1], rhotates[1][1])
	vsri.u64	d4,  d14, #64-43	@ A[0][2] = ROL64(A[2][2], rhotates[2][2])
	vsri.u64	d6,  d17, #64-21	@ A[0][3] = ROL64(A[3][3], rhotates[3][3])
	vsri.u64	d8,  d24, #64-14	@ A[0][4] = ROL64(A[4][4], rhotates[4][4])

	vshl.u64	d3,  d9,  #20
	vshl.u64	d14, d16, #25
	vshl.u64	d17, d15, #15
	vshl.u64	d24, d21, #2
	vsri.u64	d3,  d9,  #64-20	@ A[1][1] = ROL64(A[1][4], rhotates[1][4])
	vsri.u64	d14, d16, #64-25	@ A[2][2] = ROL64(A[2][3], rhotates[2][3])
	vsri.u64	d17, d15, #64-15	@ A[3][3] = ROL64(A[3][2], rhotates[3][2])
	vsri.u64	d24, d21, #64-2		@ A[4][4] = ROL64(A[4][1], rhotates[4][1])

	vshl.u64	d9,  d22, #61
	@ vshl.u64	d16, d19, #8
	vshl.u64	d15, d12, #10
	vshl.u64	d21, d7,  #55
	vsri.u64	d9,  d22, #64-61	@ A[1][4] = ROL64(A[4][2], rhotates[4][2])
	vext.8		d16, d19, d19, #8-1	@ A[2][3] = ROL64(A[3][4], rhotates[3][4])
	vsri.u64	d15, d12, #64-10	@ A[3][2] = ROL64(A[2][1], rhotates[2][1])
	vsri.u64	d21, d7,  #64-55	@ A[4][1] = ROL64(A[1][3], rhotates[1][3])

	vshl.u64	d22, d18, #39
	@ vshl.u64	d19, d23, #56
	vshl.u64	d12, d5,  #6
	vshl.u64	d7,  d13, #45
	vsri.u64	d22, d18, #64-39	@ A[4][2] = ROL64(A[2][4], rhotates[2][4])
	vext.8		d19, d23, d23, #8-7	@ A[3][4] = ROL64(A[4][3], rhotates[4][3])
	vsri.u64	d12, d5,  #64-6		@ A[2][1] = ROL64(A[1][2], rhotates[1][2])
	vsri.u64	d7,  d13, #64-45	@ A[1][3] = ROL64(A[3][1], rhotates[3][1])

	vshl.u64	d18, d20, #18
	vshl.u64	d23, d11, #41
	vshl.u64	d5,  d10, #3
	vshl.u64	d13, d1,  #36
	vsri.u64	d18, d20, #64-18	@ A[2][4] = ROL64(A[4][0], rhotates[4][0])
	vsri.u64	d23, d11, #64-41	@ A[4][3] = ROL64(A[3][0], rhotates[3][0])
	vsri.u64	d5,  d10, #64-3		@ A[1][2] = ROL64(A[2][0], rhotates[2][0])
	vsri.u64	d13, d1,  #64-36	@ A[3][1] = ROL64(A[1][0], rhotates[1][0])

	vshl.u64	d1,  d28, #28
	vshl.u64	d10, d26, #1
	vshl.u64	d11, d29, #27
	vshl.u64	d20, d27, #62
	vsri.u64	d1,  d28, #64-28	@ A[1][0] = ROL64(C[3],    rhotates[0][3])
	vsri.u64	d10, d26, #64-1		@ A[2][0] = ROL64(C[1],    rhotates[0][1])
	vsri.u64	d11, d29, #64-27	@ A[3][0] = ROL64(C[4],    rhotates[0][4])
	vsri.u64	d20, d27, #64-62	@ A[4][0] = ROL64(C[2],    rhotates[0][2])

	@ Chi + Iota
	vbic		q13, q2,  q1
	vbic		q14, q3,  q2
	vbic		q15, q4,  q3
	veor		q13, q13, q0		@ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
	veor		q14, q14, q1		@ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
	veor		q2,  q2,  q15		@ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
	vst1.64		{q13}, [r0,:64]		@ offload A[0..1][0]
	vbic		q13, q0,  q4
	vbic		q15, q1,  q0
	vmov		q1,  q14		@ A[0..1][1]
	veor		q3,  q3,  q13		@ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
	veor		q4,  q4,  q15		@ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])

	vbic		q13, q7,  q6
	vmov		q0,  q5			@ A[2..3][0]
	vbic		q14, q8,  q7
	vmov		q15, q6			@ A[2..3][1]
	veor		q5,  q5,  q13		@ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
	vbic		q13, q9,  q8
	veor		q6,  q6,  q14		@ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
	vbic		q14, q0,  q9
	veor		q7,  q7,  q13		@ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
	vbic		q13, q15, q0
	veor		q8,  q8,  q14		@ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
	vmov		q14, q10		@ A[4][0..1]
	veor		q9,  q9,  q13		@ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])

	vld1.64		d25, [r2,:64]!		@ Iota[i++]
	vbic		d26, d22, d21
	vbic		d27, d23, d22
	vld1.64		{q0}, [r0,:64]		@ restore A[0..1][0]
	veor		d20, d20, d26		@ A[4][0] ^= (~A[4][1] & A[4][2])
	vbic		d26, d24, d23
	veor		d21, d21, d27		@ A[4][1] ^= (~A[4][2] & A[4][3])
	vbic		d27, d28, d24
	veor		d22, d22, d26		@ A[4][2] ^= (~A[4][3] & A[4][4])
	vbic		d26, d29, d28
	veor		d23, d23, d27		@ A[4][3] ^= (~A[4][4] & A[4][0])
	veor		d0,  d0,  d25		@ A[0][0] ^= Iota[i]
	veor		d24, d24, d26		@ A[4][4] ^= (~A[4][0] & A[4][1])

	subs	r3, r3, #1
	bne	.Loop_neon

#ifdef	__thumb2__
	bx	lr			@ Thumb-2 implies a core that has bx
#else
	.word	0xe12fff1e		@ bx lr, spelled as data for -march=armv4
#endif
.size	KeccakF1600_neon,.-KeccakF1600_neon
2366
@ SHA3_absorb_neon: NEON variant of the absorb step.
@
@ In:	r0 = pointer to the 25-lane state, 8-byte aligned (:64 hints)
@	r1 = inp, r2 = len in bytes, r3 = bsz (block size in bytes)
@ Out:	r0 = number of input bytes left over (less than bsz)
@
@ The state is loaded into d0-d24 once, input lanes are XORed in exactly
@ as loaded, KeccakF1600_neon is called after each whole block, and the
@ registers are written back to memory at the end.  KeccakF1600_neon uses
@ the first 24 bytes at r0 as spill space, so the in-memory state is only
@ valid again after the final store sequence.
@ d8-d15 and r4-r6 are saved and restored; r12 is used as scratch.
@ NOTE(review): the lane ladder assumes bsz is a multiple of 8 in the
@ range 8..200 -- not checked here, confirm with callers.
2367.global	SHA3_absorb_neon
2368.type	SHA3_absorb_neon, %function
2369.align	5
2370SHA3_absorb_neon:
2371	stmdb	sp!, {r4-r6,lr}
2372	vstmdb	sp!, {d8-d15}
2373
2374	mov	r4, r1			@ inp
2375	mov	r5, r2			@ len
2376	mov	r6, r3			@ bsz
2377
	@ Rows 0/1 and rows 2/3 are paired lane by lane in q registers:
	@ even d register = row 0 (or 2), odd d register = row 1 (or 3).
2378	vld1.32	{d0}, [r0,:64]!		@ A[0][0]
2379	vld1.32	{d2}, [r0,:64]!		@ A[0][1]
2380	vld1.32	{d4}, [r0,:64]!		@ A[0][2]
2381	vld1.32	{d6}, [r0,:64]!		@ A[0][3]
2382	vld1.32	{d8}, [r0,:64]!		@ A[0][4]
2383
2384	vld1.32	{d1}, [r0,:64]!		@ A[1][0]
2385	vld1.32	{d3}, [r0,:64]!		@ A[1][1]
2386	vld1.32	{d5}, [r0,:64]!		@ A[1][2]
2387	vld1.32	{d7}, [r0,:64]!		@ A[1][3]
2388	vld1.32	{d9}, [r0,:64]!		@ A[1][4]
2389
2390	vld1.32	{d10}, [r0,:64]!		@ A[2][0]
2391	vld1.32	{d12}, [r0,:64]!		@ A[2][1]
2392	vld1.32	{d14}, [r0,:64]!		@ A[2][2]
2393	vld1.32	{d16}, [r0,:64]!		@ A[2][3]
2394	vld1.32	{d18}, [r0,:64]!		@ A[2][4]
2395
2396	vld1.32	{d11}, [r0,:64]!		@ A[3][0]
2397	vld1.32	{d13}, [r0,:64]!		@ A[3][1]
2398	vld1.32	{d15}, [r0,:64]!		@ A[3][2]
2399	vld1.32	{d17}, [r0,:64]!		@ A[3][3]
2400	vld1.32	{d19}, [r0,:64]!		@ A[3][4]
2401
2402	vld1.32	{d20-d23}, [r0,:64]!	@ A[4][0..3]
2403	vld1.32	{d24}, [r0,:64]		@ A[4][4]
2404	sub	r0, r0, #24*8		@ rewind
2405	b	.Loop_absorb_neon
2406
2407.align	4
2408.Loop_absorb_neon:
2409	subs	r12, r5, r6		@ len - bsz
2410	blo	.Labsorbed_neon		@ less than one block left: finish
2411	mov	r5, r12			@ commit len -= bsz for the block absorbed below
2412
	@ Lane ladder: lanes are taken two at a time against one compare.
	@ After "cmp r6, #8*k" the blo stops when bsz is below k lanes and
	@ the later beq stops when bsz is exactly k lanes; the vld1/veor in
	@ between leave the flags untouched.
2413	vld1.8	{d31}, [r4]!		@ endian-neutral loads...
2414	cmp	r6, #8*2
2415	veor	d0, d0, d31		@ A[0][0] ^= *inp++
2416	blo	.Lprocess_neon
2417	vld1.8	{d31}, [r4]!
2418	veor	d2, d2, d31		@ A[0][1] ^= *inp++
2419	beq	.Lprocess_neon
2420	vld1.8	{d31}, [r4]!
2421	cmp	r6, #8*4
2422	veor	d4, d4, d31		@ A[0][2] ^= *inp++
2423	blo	.Lprocess_neon
2424	vld1.8	{d31}, [r4]!
2425	veor	d6, d6, d31		@ A[0][3] ^= *inp++
2426	beq	.Lprocess_neon
2427	vld1.8	{d31},[r4]!
2428	cmp	r6, #8*6
2429	veor	d8, d8, d31		@ A[0][4] ^= *inp++
2430	blo	.Lprocess_neon
2431
2432	vld1.8	{d31}, [r4]!
2433	veor	d1, d1, d31		@ A[1][0] ^= *inp++
2434	beq	.Lprocess_neon
2435	vld1.8	{d31}, [r4]!
2436	cmp	r6, #8*8
2437	veor	d3, d3, d31		@ A[1][1] ^= *inp++
2438	blo	.Lprocess_neon
2439	vld1.8	{d31}, [r4]!
2440	veor	d5, d5, d31		@ A[1][2] ^= *inp++
2441	beq	.Lprocess_neon
2442	vld1.8	{d31}, [r4]!
2443	cmp	r6, #8*10
2444	veor	d7, d7, d31		@ A[1][3] ^= *inp++
2445	blo	.Lprocess_neon
2446	vld1.8	{d31}, [r4]!
2447	veor	d9, d9, d31		@ A[1][4] ^= *inp++
2448	beq	.Lprocess_neon
2449
2450	vld1.8	{d31}, [r4]!
2451	cmp	r6, #8*12
2452	veor	d10, d10, d31		@ A[2][0] ^= *inp++
2453	blo	.Lprocess_neon
2454	vld1.8	{d31}, [r4]!
2455	veor	d12, d12, d31		@ A[2][1] ^= *inp++
2456	beq	.Lprocess_neon
2457	vld1.8	{d31}, [r4]!
2458	cmp	r6, #8*14
2459	veor	d14, d14, d31		@ A[2][2] ^= *inp++
2460	blo	.Lprocess_neon
2461	vld1.8	{d31}, [r4]!
2462	veor	d16, d16, d31		@ A[2][3] ^= *inp++
2463	beq	.Lprocess_neon
2464	vld1.8	{d31}, [r4]!
2465	cmp	r6, #8*16
2466	veor	d18, d18, d31		@ A[2][4] ^= *inp++
2467	blo	.Lprocess_neon
2468
2469	vld1.8	{d31}, [r4]!
2470	veor	d11, d11, d31		@ A[3][0] ^= *inp++
2471	beq	.Lprocess_neon
2472	vld1.8	{d31}, [r4]!
2473	cmp	r6, #8*18
2474	veor	d13, d13, d31		@ A[3][1] ^= *inp++
2475	blo	.Lprocess_neon
2476	vld1.8	{d31}, [r4]!
2477	veor	d15, d15, d31		@ A[3][2] ^= *inp++
2478	beq	.Lprocess_neon
2479	vld1.8	{d31}, [r4]!
2480	cmp	r6, #8*20
2481	veor	d17, d17, d31		@ A[3][3] ^= *inp++
2482	blo	.Lprocess_neon
2483	vld1.8	{d31}, [r4]!
2484	veor	d19, d19, d31		@ A[3][4] ^= *inp++
2485	beq	.Lprocess_neon
2486
2487	vld1.8	{d31}, [r4]!
2488	cmp	r6, #8*22
2489	veor	d20, d20, d31		@ A[4][0] ^= *inp++
2490	blo	.Lprocess_neon
2491	vld1.8	{d31}, [r4]!
2492	veor	d21, d21, d31		@ A[4][1] ^= *inp++
2493	beq	.Lprocess_neon
2494	vld1.8	{d31}, [r4]!
2495	cmp	r6, #8*24
2496	veor	d22, d22, d31		@ A[4][2] ^= *inp++
2497	blo	.Lprocess_neon
2498	vld1.8	{d31}, [r4]!
2499	veor	d23, d23, d31		@ A[4][3] ^= *inp++
2500	beq	.Lprocess_neon
2501	vld1.8	{d31}, [r4]!
2502	veor	d24, d24, d31		@ A[4][4] ^= *inp++
2503
2504.Lprocess_neon:
2505	bl	KeccakF1600_neon	@ r0 (state buffer) doubles as its spill area
2506	b 	.Loop_absorb_neon
2507
2508.align	4
2509.Labsorbed_neon:
	@ Write every lane back, in the same order it was loaded.
2510	vst1.32	{d0}, [r0,:64]!		@ A[0][0..4]
2511	vst1.32	{d2}, [r0,:64]!
2512	vst1.32	{d4}, [r0,:64]!
2513	vst1.32	{d6}, [r0,:64]!
2514	vst1.32	{d8}, [r0,:64]!
2515
2516	vst1.32	{d1}, [r0,:64]!		@ A[1][0..4]
2517	vst1.32	{d3}, [r0,:64]!
2518	vst1.32	{d5}, [r0,:64]!
2519	vst1.32	{d7}, [r0,:64]!
2520	vst1.32	{d9}, [r0,:64]!
2521
2522	vst1.32	{d10}, [r0,:64]!		@ A[2][0..4]
2523	vst1.32	{d12}, [r0,:64]!
2524	vst1.32	{d14}, [r0,:64]!
2525	vst1.32	{d16}, [r0,:64]!
2526	vst1.32	{d18}, [r0,:64]!
2527
2528	vst1.32	{d11}, [r0,:64]!		@ A[3][0..4]
2529	vst1.32	{d13}, [r0,:64]!
2530	vst1.32	{d15}, [r0,:64]!
2531	vst1.32	{d17}, [r0,:64]!
2532	vst1.32	{d19}, [r0,:64]!
2533
2534	vst1.32	{d20-d23}, [r0,:64]!	@ A[4][0..4]
2535	vst1.32	{d24}, [r0,:64]
2536
2537	mov	r0, r5			@ return value
2538	vldmia	sp!, {d8-d15}
2539	ldmia	sp!, {r4-r6,pc}
2540.size	SHA3_absorb_neon,.-SHA3_absorb_neon
2541
@ ----------------------------------------------------------------------
@ SHA3_squeeze_neon
@
@ Copies len bytes of sponge output from the Keccak state to out. After
@ every bsz bytes taken from the state, the state is loaded into NEON
@ registers, permuted with KeccakF1600_neon and written back, and the
@ copy restarts from the beginning of the state.
@
@ In:	r0 = A		state, 25 lanes of 64 bits, updated in place;
@			accessed with :64 qualifiers, so it has to be
@			64-bit aligned
@	r1 = out	destination buffer
@	r2 = len	number of bytes to produce; 0 stores nothing
@	r3 = bsz	bytes consumed between permutations, counted
@			down in steps of 8
@			NOTE(review): presumably always a multiple of 8
@			and no larger than the state - confirm at callers
@
@ Register roles inside the function:
@	r4  = output cursor		r5  = bytes still to produce
@	r6  = bsz (reload value)	r12 = read cursor inside the state
@	r14 = bytes left before the next permutation
@
@ r4-r6 and lr are saved on entry and restored on exit; d8-d15 are
@ saved only around the permutation, which is the only place they are
@ overwritten. The code relies on KeccakF1600_neon leaving r0 and r4-r6
@ intact, since they are used after the call without being reloaded.
@ ----------------------------------------------------------------------
.global	SHA3_squeeze_neon
.type	SHA3_squeeze_neon, %function
.align	5
SHA3_squeeze_neon:
	stmdb	sp!, {r4-r6,lr}

	mov	r4, r1			@ out
	mov	r5, r2			@ len
	mov	r6, r3			@ bsz
	mov	r12, r0			@ A_flat
	mov	r14, r3			@ bsz

	@ A zero-length request must not touch out. Without this check
	@ the loop below falls into the tail, which stores its first
	@ byte before it tests the remaining length.
	cmp	r5, #0
	beq	.Lsqueeze_neon_done
	b	.Loop_squeeze_neon

.align	4
.Loop_squeeze_neon:
	cmp	r5, #8
	blo	.Lsqueeze_neon_tail	@ 1..7 bytes left, finish bytewise
	vld1.32	{d0}, [r12]!		@ next lane of the state
	vst1.8	{d0}, [r4]!		@ endian-neutral store

	subs	r5, r5, #8		@ len -= 8
	beq	.Lsqueeze_neon_done

	subs	r14, r14, #8		@ bsz -= 8
	bhi	.Loop_squeeze_neon	@ block not exhausted yet

	@ Block exhausted with output still pending: permute the state.
	vstmdb	sp!,  {d8-d15}		@ about to be overwritten below

	@ Load the state using the register layout expected by
	@ KeccakF1600_neon: rows 0/1 and rows 2/3 are interleaved over
	@ even/odd d registers, row 4 sits in d20-d24.
	vld1.32	{d0}, [r0,:64]!		@ A[0][0..4]
	vld1.32	{d2}, [r0,:64]!
	vld1.32	{d4}, [r0,:64]!
	vld1.32	{d6}, [r0,:64]!
	vld1.32	{d8}, [r0,:64]!

	vld1.32	{d1}, [r0,:64]!		@ A[1][0..4]
	vld1.32	{d3}, [r0,:64]!
	vld1.32	{d5}, [r0,:64]!
	vld1.32	{d7}, [r0,:64]!
	vld1.32	{d9}, [r0,:64]!

	vld1.32	{d10}, [r0,:64]!		@ A[2][0..4]
	vld1.32	{d12}, [r0,:64]!
	vld1.32	{d14}, [r0,:64]!
	vld1.32	{d16}, [r0,:64]!
	vld1.32	{d18}, [r0,:64]!

	vld1.32	{d11}, [r0,:64]!		@ A[3][0..4]
	vld1.32	{d13}, [r0,:64]!
	vld1.32	{d15}, [r0,:64]!
	vld1.32	{d17}, [r0,:64]!
	vld1.32	{d19}, [r0,:64]!

	vld1.32	{d20-d23}, [r0,:64]!	@ A[4][0..4]
	vld1.32	{d24}, [r0,:64]		@ last lane, no writeback
	sub	r0, r0, #24*8		@ rewind

	bl	KeccakF1600_neon

	mov	r12, r0			@ A_flat
	vst1.32	{d0}, [r0,:64]!		@ A[0][0..4]
	vst1.32	{d2}, [r0,:64]!
	vst1.32	{d4}, [r0,:64]!
	vst1.32	{d6}, [r0,:64]!
	vst1.32	{d8}, [r0,:64]!

	vst1.32	{d1}, [r0,:64]!		@ A[1][0..4]
	vst1.32	{d3}, [r0,:64]!
	vst1.32	{d5}, [r0,:64]!
	vst1.32	{d7}, [r0,:64]!
	vst1.32	{d9}, [r0,:64]!

	vst1.32	{d10}, [r0,:64]!		@ A[2][0..4]
	vst1.32	{d12}, [r0,:64]!
	vst1.32	{d14}, [r0,:64]!
	vst1.32	{d16}, [r0,:64]!
	vst1.32	{d18}, [r0,:64]!

	vst1.32	{d11}, [r0,:64]!		@ A[3][0..4]
	vst1.32	{d13}, [r0,:64]!
	vst1.32	{d15}, [r0,:64]!
	vst1.32	{d17}, [r0,:64]!
	vst1.32	{d19}, [r0,:64]!

	vst1.32	{d20-d23}, [r0,:64]!	@ A[4][0..4]
	mov	r14, r6			@ bsz
	vst1.32	{d24}, [r0,:64]
	mov	r0,  r12		@ rewind

	vldmia	sp!, {d8-d15}
	b	.Loop_squeeze_neon

	@ Tail: r5 is 1..7 here. One lane is fetched as two 32-bit
	@ words (r2 = low half, r3 = high half) and emitted one byte
	@ at a time, least significant byte first. Each cmp serves two
	@ exits: blo right after a store, beq one store later.
.align	4
.Lsqueeze_neon_tail:
	ldmia	r12, {r2,r3}
	cmp	r5, #2
	strb	r2, [r4],#1		@ endian-neutral store
	mov	r2, r2, lsr#8
	blo	.Lsqueeze_neon_done	@ len == 1
	strb	r2, [r4], #1
	mov	r2, r2, lsr#8
	beq	.Lsqueeze_neon_done	@ len == 2
	strb	r2, [r4], #1
	mov	r2, r2, lsr#8
	cmp	r5, #4
	blo	.Lsqueeze_neon_done	@ len == 3
	strb	r2, [r4], #1
	beq	.Lsqueeze_neon_done	@ len == 4

	strb	r3, [r4], #1
	mov	r3, r3, lsr#8
	cmp	r5, #6
	blo	.Lsqueeze_neon_done	@ len == 5
	strb	r3, [r4], #1
	mov	r3, r3, lsr#8
	beq	.Lsqueeze_neon_done	@ len == 6
	strb	r3, [r4], #1		@ len == 7

.Lsqueeze_neon_done:
	ldmia	sp!, {r4-r6,pc}
.size	SHA3_squeeze_neon,.-SHA3_squeeze_neon
2662#endif
2663.asciz	"Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
2664.align	2
2665