• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#include "arm_arch.h"
2
3#if defined(__thumb2__)
4.syntax	unified
5.thumb
6#else
7.code	32
8#endif
9
10.text
11
@ iotas32 - iota round-constant table read by KeccakF1600_int.
@
@ Layout: 24 entries, each a pair of 32-bit words (8 bytes per entry,
@ 192 bytes in total).  KeccakF1600_int forms iotas32 + counter with
@ adr/add, loads the pair at offset +0 for the first round of an
@ iteration and the pair at offset +8 for the second, then advances the
@ counter by 16 and compares it with #192, the size of this table.  The
@ comments at the +8/+12 loads name the first word of a pair .lo and the
@ second .hi.
@
@ NOTE(review): the word pairs look like the standard 64-bit Keccak round
@ constants in bit-interleaved form (even-numbered bits gathered in the
@ first word, odd-numbered bits in the second), e.g. 0x8082 would give
@ 0x00000000, 0x00000089 - confirm against the script that generates
@ this file before editing any value.
@
@ The table sits in .text next to the code; presumably so that the adr
@ in KeccakF1600_int can reach it PC-relatively - TODO confirm.
12.type	iotas32, %object	@ mark the symbol as a data object
13.align	5	@ 2^5 = 32-byte boundary (ARM .align takes a power of two)
14iotas32:
15.long	0x00000001, 0x00000000	@ entry  0
16.long	0x00000000, 0x00000089	@ entry  1
17.long	0x00000000, 0x8000008b	@ entry  2
18.long	0x00000000, 0x80008080	@ entry  3
19.long	0x00000001, 0x0000008b	@ entry  4
20.long	0x00000001, 0x00008000	@ entry  5
21.long	0x00000001, 0x80008088	@ entry  6
22.long	0x00000001, 0x80000082	@ entry  7
23.long	0x00000000, 0x0000000b	@ entry  8
24.long	0x00000000, 0x0000000a	@ entry  9
25.long	0x00000001, 0x00008082	@ entry 10
26.long	0x00000000, 0x00008003	@ entry 11
27.long	0x00000001, 0x0000808b	@ entry 12
28.long	0x00000001, 0x8000000b	@ entry 13
29.long	0x00000001, 0x8000008a	@ entry 14
30.long	0x00000001, 0x80000081	@ entry 15
31.long	0x00000000, 0x80000081	@ entry 16
32.long	0x00000000, 0x80000008	@ entry 17
33.long	0x00000000, 0x00000083	@ entry 18
34.long	0x00000000, 0x80008003	@ entry 19
35.long	0x00000001, 0x80008088	@ entry 20
36.long	0x00000000, 0x80000088	@ entry 21
37.long	0x00000001, 0x00008000	@ entry 22
38.long	0x00000000, 0x80008082	@ entry 23
39.size	iotas32,.-iotas32	@ symbol size = bytes emitted since the label
40
41.type	KeccakF1600_int, %function
42.align	5
43KeccakF1600_int:
44	add	r9,sp,#176
45	add	r12,sp,#0
46	add	r10,sp,#40
47	ldmia	r9,{r4,r5,r6,r7,r8,r9}		@ A[4][2..4]
48KeccakF1600_enter:
49	str	lr,[sp,#440]
50	eor	r11,r11,r11
51	str	r11,[sp,#444]
52	b	.Lround2x
53
54.align	4
55.Lround2x:
56	ldmia	r12,{r0,r1,r2,r3}		@ A[0][0..1]
57	ldmia	r10,{r10,r11,r12,r14}	@ A[1][0..1]
58#ifdef	__thumb2__
59	eor	r0,r0,r10
60	eor	r1,r1,r11
61	eor	r2,r2,r12
62	ldrd	r10,r11,[sp,#56]
63	eor	r3,r3,r14
64	ldrd	r12,r14,[sp,#64]
65	eor	r4,r4,r10
66	eor	r5,r5,r11
67	eor	r6,r6,r12
68	ldrd	r10,r11,[sp,#72]
69	eor	r7,r7,r14
70	ldrd	r12,r14,[sp,#80]
71	eor	r8,r8,r10
72	eor	r9,r9,r11
73	eor	r0,r0,r12
74	ldrd	r10,r11,[sp,#88]
75	eor	r1,r1,r14
76	ldrd	r12,r14,[sp,#96]
77	eor	r2,r2,r10
78	eor	r3,r3,r11
79	eor	r4,r4,r12
80	ldrd	r10,r11,[sp,#104]
81	eor	r5,r5,r14
82	ldrd	r12,r14,[sp,#112]
83	eor	r6,r6,r10
84	eor	r7,r7,r11
85	eor	r8,r8,r12
86	ldrd	r10,r11,[sp,#120]
87	eor	r9,r9,r14
88	ldrd	r12,r14,[sp,#128]
89	eor	r0,r0,r10
90	eor	r1,r1,r11
91	eor	r2,r2,r12
92	ldrd	r10,r11,[sp,#136]
93	eor	r3,r3,r14
94	ldrd	r12,r14,[sp,#144]
95	eor	r4,r4,r10
96	eor	r5,r5,r11
97	eor	r6,r6,r12
98	ldrd	r10,r11,[sp,#152]
99	eor	r7,r7,r14
100	ldrd	r12,r14,[sp,#160]
101	eor	r8,r8,r10
102	eor	r9,r9,r11
103	eor	r0,r0,r12
104	ldrd	r10,r11,[sp,#168]
105	eor	r1,r1,r14
106	ldrd	r12,r14,[sp,#16]
107	eor	r2,r2,r10
108	eor	r3,r3,r11
109	eor	r4,r4,r12
110	ldrd	r10,r11,[sp,#24]
111	eor	r5,r5,r14
112	ldrd	r12,r14,[sp,#32]
113#else
114	eor	r0,r0,r10
115	add	r10,sp,#56
116	eor	r1,r1,r11
117	eor	r2,r2,r12
118	eor	r3,r3,r14
119	ldmia	r10,{r10,r11,r12,r14}	@ A[1][2..3]
120	eor	r4,r4,r10
121	add	r10,sp,#72
122	eor	r5,r5,r11
123	eor	r6,r6,r12
124	eor	r7,r7,r14
125	ldmia	r10,{r10,r11,r12,r14}	@ A[1][4]..A[2][0]
126	eor	r8,r8,r10
127	add	r10,sp,#88
128	eor	r9,r9,r11
129	eor	r0,r0,r12
130	eor	r1,r1,r14
131	ldmia	r10,{r10,r11,r12,r14}	@ A[2][1..2]
132	eor	r2,r2,r10
133	add	r10,sp,#104
134	eor	r3,r3,r11
135	eor	r4,r4,r12
136	eor	r5,r5,r14
137	ldmia	r10,{r10,r11,r12,r14}	@ A[2][3..4]
138	eor	r6,r6,r10
139	add	r10,sp,#120
140	eor	r7,r7,r11
141	eor	r8,r8,r12
142	eor	r9,r9,r14
143	ldmia	r10,{r10,r11,r12,r14}	@ A[3][0..1]
144	eor	r0,r0,r10
145	add	r10,sp,#136
146	eor	r1,r1,r11
147	eor	r2,r2,r12
148	eor	r3,r3,r14
149	ldmia	r10,{r10,r11,r12,r14}	@ A[3][2..3]
150	eor	r4,r4,r10
151	add	r10,sp,#152
152	eor	r5,r5,r11
153	eor	r6,r6,r12
154	eor	r7,r7,r14
155	ldmia	r10,{r10,r11,r12,r14}	@ A[3][4]..A[4][0]
156	eor	r8,r8,r10
157	ldr	r10,[sp,#168]		@ A[4][1]
158	eor	r9,r9,r11
159	ldr	r11,[sp,#168+4]
160	eor	r0,r0,r12
161	ldr	r12,[sp,#16]		@ A[0][2]
162	eor	r1,r1,r14
163	ldr	r14,[sp,#16+4]
164	eor	r2,r2,r10
165	add	r10,sp,#24
166	eor	r3,r3,r11
167	eor	r4,r4,r12
168	eor	r5,r5,r14
169	ldmia	r10,{r10,r11,r12,r14}	@ A[0][3..4]
170#endif
171	eor	r6,r6,r10
172	eor	r7,r7,r11
173	eor	r8,r8,r12
174	eor	r9,r9,r14
175
176	eor	r10,r0,r5,ror#32-1	@ E[0] = ROL64(C[2], 1) ^ C[0];
177#ifndef	__thumb2__
178	str	r10,[sp,#208]		@ D[1] = E[0]
179#endif
180	eor	r11,r1,r4
181#ifndef	__thumb2__
182	str	r11,[sp,#208+4]
183#else
184	strd	r10,r11,[sp,#208]		@ D[1] = E[0]
185#endif
186	eor	r12,r6,r1,ror#32-1	@ E[1] = ROL64(C[0], 1) ^ C[3];
187	eor	r14,r7,r0
188#ifndef	__thumb2__
189	str	r12,[sp,#232]		@ D[4] = E[1]
190#endif
191	eor	r0,r8,r3,ror#32-1	@ C[0] = ROL64(C[1], 1) ^ C[4];
192#ifndef	__thumb2__
193	str	r14,[sp,#232+4]
194#else
195	strd	r12,r14,[sp,#232]		@ D[4] = E[1]
196#endif
197	eor	r1,r9,r2
198#ifndef	__thumb2__
199	str	r0,[sp,#200]		@ D[0] = C[0]
200#endif
201	eor	r2,r2,r7,ror#32-1	@ C[1] = ROL64(C[3], 1) ^ C[1];
202#ifndef	__thumb2__
203	ldr	r7,[sp,#144]
204#endif
205	eor	r3,r3,r6
206#ifndef	__thumb2__
207	str	r1,[sp,#200+4]
208#else
209	strd	r0,r1,[sp,#200]		@ D[0] = C[0]
210#endif
211#ifndef	__thumb2__
212	ldr	r6,[sp,#144+4]
213#else
214	ldrd	r7,r6,[sp,#144]
215#endif
216#ifndef	__thumb2__
217	str	r2,[sp,#216]		@ D[2] = C[1]
218#endif
219	eor	r4,r4,r9,ror#32-1	@ C[2] = ROL64(C[4], 1) ^ C[2];
220#ifndef	__thumb2__
221	str	r3,[sp,#216+4]
222#else
223	strd	r2,r3,[sp,#216]		@ D[2] = C[1]
224#endif
225	eor	r5,r5,r8
226
227#ifndef	__thumb2__
228	ldr	r8,[sp,#192]
229#endif
230#ifndef	__thumb2__
231	ldr	r9,[sp,#192+4]
232#else
233	ldrd	r8,r9,[sp,#192]
234#endif
235#ifndef	__thumb2__
236	str	r4,[sp,#224]		@ D[3] = C[2]
237#endif
238	eor	r7,r7,r4
239#ifndef	__thumb2__
240	str	r5,[sp,#224+4]
241#else
242	strd	r4,r5,[sp,#224]		@ D[3] = C[2]
243#endif
244	eor	r6,r6,r5
245#ifndef	__thumb2__
246	ldr	r4,[sp,#0]
247#endif
248	@ mov	r7,r7,ror#32-10		@ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
249	@ mov	r6,r6,ror#32-11
250#ifndef	__thumb2__
251	ldr	r5,[sp,#0+4]
252#else
253	ldrd	r4,r5,[sp,#0]
254#endif
255	eor	r8,r8,r12
256	eor	r9,r9,r14
257#ifndef	__thumb2__
258	ldr	r12,[sp,#96]
259#endif
260	eor	r0,r0,r4
261#ifndef	__thumb2__
262	ldr	r14,[sp,#96+4]
263#else
264	ldrd	r12,r14,[sp,#96]
265#endif
266	@ mov	r8,r8,ror#32-7		@ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
267	@ mov	r9,r9,ror#32-7
268	eor	r1,r1,r5		@ C[0] =       A[0][0] ^ C[0];
269	eor	r12,r12,r2
270#ifndef	__thumb2__
271	ldr	r2,[sp,#48]
272#endif
273	eor	r14,r14,r3
274#ifndef	__thumb2__
275	ldr	r3,[sp,#48+4]
276#else
277	ldrd	r2,r3,[sp,#48]
278#endif
279	mov	r5,r12,ror#32-21		@ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);
280	ldr	r12,[sp,#444]			@ load counter
281	eor	r2,r2,r10
282	adr	r10,iotas32
283	mov	r4,r14,ror#32-22
284	add	r14,r10,r12
285	eor	r3,r3,r11
286	ldmia	r14,{r10,r11}		@ iotas[i]
287	bic	r12,r4,r2,ror#32-22
288	bic	r14,r5,r3,ror#32-22
289	mov	r2,r2,ror#32-22		@ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]);
290	mov	r3,r3,ror#32-22
291	eor	r12,r12,r0
292	eor	r14,r14,r1
293	eor	r10,r10,r12
294	eor	r11,r11,r14
295#ifndef	__thumb2__
296	str	r10,[sp,#240]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
297#endif
298	bic	r12,r6,r4,ror#11
299#ifndef	__thumb2__
300	str	r11,[sp,#240+4]
301#else
302	strd	r10,r11,[sp,#240]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
303#endif
304	bic	r14,r7,r5,ror#10
305	bic	r10,r8,r6,ror#32-(11-7)
306	bic	r11,r9,r7,ror#32-(10-7)
307	eor	r12,r2,r12,ror#32-11
308#ifndef	__thumb2__
309	str	r12,[sp,#248]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
310#endif
311	eor	r14,r3,r14,ror#32-10
312#ifndef	__thumb2__
313	str	r14,[sp,#248+4]
314#else
315	strd	r12,r14,[sp,#248]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
316#endif
317	eor	r10,r4,r10,ror#32-7
318	eor	r11,r5,r11,ror#32-7
319#ifndef	__thumb2__
320	str	r10,[sp,#256]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
321#endif
322	bic	r12,r0,r8,ror#32-7
323#ifndef	__thumb2__
324	str	r11,[sp,#256+4]
325#else
326	strd	r10,r11,[sp,#256]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
327#endif
328	bic	r14,r1,r9,ror#32-7
329	eor	r12,r12,r6,ror#32-11
330#ifndef	__thumb2__
331	str	r12,[sp,#264]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
332#endif
333	eor	r14,r14,r7,ror#32-10
334#ifndef	__thumb2__
335	str	r14,[sp,#264+4]
336#else
337	strd	r12,r14,[sp,#264]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
338#endif
339	bic	r10,r2,r0
340	add	r14,sp,#224
341#ifndef	__thumb2__
342	ldr	r0,[sp,#24]		@ A[0][3]
343#endif
344	bic	r11,r3,r1
345#ifndef	__thumb2__
346	ldr	r1,[sp,#24+4]
347#else
348	ldrd	r0,r1,[sp,#24]		@ A[0][3]
349#endif
350	eor	r10,r10,r8,ror#32-7
351	eor	r11,r11,r9,ror#32-7
352#ifndef	__thumb2__
353	str	r10,[sp,#272]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
354#endif
355	add	r9,sp,#200
356#ifndef	__thumb2__
357	str	r11,[sp,#272+4]
358#else
359	strd	r10,r11,[sp,#272]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
360#endif
361
362	ldmia	r14,{r10,r11,r12,r14}	@ D[3..4]
363	ldmia	r9,{r6,r7,r8,r9}		@ D[0..1]
364
365#ifndef	__thumb2__
366	ldr	r2,[sp,#72]		@ A[1][4]
367#endif
368	eor	r0,r0,r10
369#ifndef	__thumb2__
370	ldr	r3,[sp,#72+4]
371#else
372	ldrd	r2,r3,[sp,#72]		@ A[1][4]
373#endif
374	eor	r1,r1,r11
375	@ mov	r0,r0,ror#32-14		@ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
376#ifndef	__thumb2__
377	ldr	r10,[sp,#128]		@ A[3][1]
378#endif
379	@ mov	r1,r1,ror#32-14
380#ifndef	__thumb2__
381	ldr	r11,[sp,#128+4]
382#else
383	ldrd	r10,r11,[sp,#128]		@ A[3][1]
384#endif
385
386	eor	r2,r2,r12
387#ifndef	__thumb2__
388	ldr	r4,[sp,#80]		@ A[2][0]
389#endif
390	eor	r3,r3,r14
391#ifndef	__thumb2__
392	ldr	r5,[sp,#80+4]
393#else
394	ldrd	r4,r5,[sp,#80]		@ A[2][0]
395#endif
396	@ mov	r2,r2,ror#32-10		@ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
397	@ mov	r3,r3,ror#32-10
398
399	eor	r6,r6,r4
400#ifndef	__thumb2__
401	ldr	r12,[sp,#216]		@ D[2]
402#endif
403	eor	r7,r7,r5
404#ifndef	__thumb2__
405	ldr	r14,[sp,#216+4]
406#else
407	ldrd	r12,r14,[sp,#216]		@ D[2]
408#endif
409	mov	r5,r6,ror#32-1		@ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
410	mov	r4,r7,ror#32-2
411
412	eor	r10,r10,r8
413#ifndef	__thumb2__
414	ldr	r8,[sp,#176]		@ A[4][2]
415#endif
416	eor	r11,r11,r9
417#ifndef	__thumb2__
418	ldr	r9,[sp,#176+4]
419#else
420	ldrd	r8,r9,[sp,#176]		@ A[4][2]
421#endif
422	mov	r7,r10,ror#32-22		@ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
423	mov	r6,r11,ror#32-23
424
425	bic	r10,r4,r2,ror#32-10
426	bic	r11,r5,r3,ror#32-10
427	eor	r12,r12,r8
428	eor	r14,r14,r9
429	mov	r9,r12,ror#32-30		@ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
430	mov	r8,r14,ror#32-31
431	eor	r10,r10,r0,ror#32-14
432	eor	r11,r11,r1,ror#32-14
433#ifndef	__thumb2__
434	str	r10,[sp,#280]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
435#endif
436	bic	r12,r6,r4
437#ifndef	__thumb2__
438	str	r11,[sp,#280+4]
439#else
440	strd	r10,r11,[sp,#280]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
441#endif
442	bic	r14,r7,r5
443	eor	r12,r12,r2,ror#32-10
444#ifndef	__thumb2__
445	str	r12,[sp,#288]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
446#endif
447	eor	r14,r14,r3,ror#32-10
448#ifndef	__thumb2__
449	str	r14,[sp,#288+4]
450#else
451	strd	r12,r14,[sp,#288]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
452#endif
453	bic	r10,r8,r6
454	bic	r11,r9,r7
455	bic	r12,r0,r8,ror#14
456	bic	r14,r1,r9,ror#14
457	eor	r10,r10,r4
458	eor	r11,r11,r5
459#ifndef	__thumb2__
460	str	r10,[sp,#296]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
461#endif
462	bic	r2,r2,r0,ror#32-(14-10)
463#ifndef	__thumb2__
464	str	r11,[sp,#296+4]
465#else
466	strd	r10,r11,[sp,#296]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
467#endif
468	eor	r12,r6,r12,ror#32-14
469	bic	r11,r3,r1,ror#32-(14-10)
470#ifndef	__thumb2__
471	str	r12,[sp,#304]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
472#endif
473	eor	r14,r7,r14,ror#32-14
474#ifndef	__thumb2__
475	str	r14,[sp,#304+4]
476#else
477	strd	r12,r14,[sp,#304]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
478#endif
479	add	r12,sp,#208
480#ifndef	__thumb2__
481	ldr	r1,[sp,#8]		@ A[0][1]
482#endif
483	eor	r10,r8,r2,ror#32-10
484#ifndef	__thumb2__
485	ldr	r0,[sp,#8+4]
486#else
487	ldrd	r1,r0,[sp,#8]		@ A[0][1]
488#endif
489	eor	r11,r9,r11,ror#32-10
490#ifndef	__thumb2__
491	str	r10,[sp,#312]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
492#endif
493#ifndef	__thumb2__
494	str	r11,[sp,#312+4]
495#else
496	strd	r10,r11,[sp,#312]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
497#endif
498
499	add	r9,sp,#224
500	ldmia	r12,{r10,r11,r12,r14}	@ D[1..2]
501#ifndef	__thumb2__
502	ldr	r2,[sp,#56]		@ A[1][2]
503#endif
504#ifndef	__thumb2__
505	ldr	r3,[sp,#56+4]
506#else
507	ldrd	r2,r3,[sp,#56]		@ A[1][2]
508#endif
509	ldmia	r9,{r6,r7,r8,r9}		@ D[3..4]
510
511	eor	r1,r1,r10
512#ifndef	__thumb2__
513	ldr	r4,[sp,#104]		@ A[2][3]
514#endif
515	eor	r0,r0,r11
516#ifndef	__thumb2__
517	ldr	r5,[sp,#104+4]
518#else
519	ldrd	r4,r5,[sp,#104]		@ A[2][3]
520#endif
521	mov	r0,r0,ror#32-1		@ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
522
523	eor	r2,r2,r12
524#ifndef	__thumb2__
525	ldr	r10,[sp,#152]		@ A[3][4]
526#endif
527	eor	r3,r3,r14
528#ifndef	__thumb2__
529	ldr	r11,[sp,#152+4]
530#else
531	ldrd	r10,r11,[sp,#152]		@ A[3][4]
532#endif
533	@ mov	r2,r2,ror#32-3		@ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
534#ifndef	__thumb2__
535	ldr	r12,[sp,#200]		@ D[0]
536#endif
537	@ mov	r3,r3,ror#32-3
538#ifndef	__thumb2__
539	ldr	r14,[sp,#200+4]
540#else
541	ldrd	r12,r14,[sp,#200]		@ D[0]
542#endif
543
544	eor	r4,r4,r6
545	eor	r5,r5,r7
546	@ mov	r5,r6,ror#32-12		@ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
547	@ mov	r4,r7,ror#32-13		@ [track reverse order below]
548
549	eor	r10,r10,r8
550#ifndef	__thumb2__
551	ldr	r8,[sp,#160]		@ A[4][0]
552#endif
553	eor	r11,r11,r9
554#ifndef	__thumb2__
555	ldr	r9,[sp,#160+4]
556#else
557	ldrd	r8,r9,[sp,#160]		@ A[4][0]
558#endif
559	mov	r6,r10,ror#32-4		@ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
560	mov	r7,r11,ror#32-4
561
562	eor	r12,r12,r8
563	eor	r14,r14,r9
564	mov	r8,r12,ror#32-9		@ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
565	mov	r9,r14,ror#32-9
566
567	bic	r10,r5,r2,ror#13-3
568	bic	r11,r4,r3,ror#12-3
569	bic	r12,r6,r5,ror#32-13
570	bic	r14,r7,r4,ror#32-12
571	eor	r10,r0,r10,ror#32-13
572	eor	r11,r1,r11,ror#32-12
573#ifndef	__thumb2__
574	str	r10,[sp,#320]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
575#endif
576	eor	r12,r12,r2,ror#32-3
577#ifndef	__thumb2__
578	str	r11,[sp,#320+4]
579#else
580	strd	r10,r11,[sp,#320]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
581#endif
582	eor	r14,r14,r3,ror#32-3
583#ifndef	__thumb2__
584	str	r12,[sp,#328]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
585#endif
586	bic	r10,r8,r6
587	bic	r11,r9,r7
588#ifndef	__thumb2__
589	str	r14,[sp,#328+4]
590#else
591	strd	r12,r14,[sp,#328]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
592#endif
593	eor	r10,r10,r5,ror#32-13
594	eor	r11,r11,r4,ror#32-12
595#ifndef	__thumb2__
596	str	r10,[sp,#336]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
597#endif
598	bic	r12,r0,r8
599#ifndef	__thumb2__
600	str	r11,[sp,#336+4]
601#else
602	strd	r10,r11,[sp,#336]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
603#endif
604	bic	r14,r1,r9
605	eor	r12,r12,r6
606	eor	r14,r14,r7
607#ifndef	__thumb2__
608	str	r12,[sp,#344]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
609#endif
610	bic	r10,r2,r0,ror#3
611#ifndef	__thumb2__
612	str	r14,[sp,#344+4]
613#else
614	strd	r12,r14,[sp,#344]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
615#endif
616	bic	r11,r3,r1,ror#3
617#ifndef	__thumb2__
618	ldr	r1,[sp,#32]		@ A[0][4] [in reverse order]
619#endif
620	eor	r10,r8,r10,ror#32-3
621#ifndef	__thumb2__
622	ldr	r0,[sp,#32+4]
623#else
624	ldrd	r1,r0,[sp,#32]		@ A[0][4] [in reverse order]
625#endif
626	eor	r11,r9,r11,ror#32-3
627#ifndef	__thumb2__
628	str	r10,[sp,#352]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
629#endif
630	add	r9,sp,#208
631#ifndef	__thumb2__
632	str	r11,[sp,#352+4]
633#else
634	strd	r10,r11,[sp,#352]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
635#endif
636
637#ifndef	__thumb2__
638	ldr	r10,[sp,#232]		@ D[4]
639#endif
640#ifndef	__thumb2__
641	ldr	r11,[sp,#232+4]
642#else
643	ldrd	r10,r11,[sp,#232]		@ D[4]
644#endif
645#ifndef	__thumb2__
646	ldr	r12,[sp,#200]		@ D[0]
647#endif
648#ifndef	__thumb2__
649	ldr	r14,[sp,#200+4]
650#else
651	ldrd	r12,r14,[sp,#200]		@ D[0]
652#endif
653
654	ldmia	r9,{r6,r7,r8,r9}		@ D[1..2]
655
656	eor	r1,r1,r10
657#ifndef	__thumb2__
658	ldr	r2,[sp,#40]		@ A[1][0]
659#endif
660	eor	r0,r0,r11
661#ifndef	__thumb2__
662	ldr	r3,[sp,#40+4]
663#else
664	ldrd	r2,r3,[sp,#40]		@ A[1][0]
665#endif
666	@ mov	r1,r10,ror#32-13		@ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
667#ifndef	__thumb2__
668	ldr	r4,[sp,#88]		@ A[2][1]
669#endif
670	@ mov	r0,r11,ror#32-14		@ [was loaded in reverse order]
671#ifndef	__thumb2__
672	ldr	r5,[sp,#88+4]
673#else
674	ldrd	r4,r5,[sp,#88]		@ A[2][1]
675#endif
676
677	eor	r2,r2,r12
678#ifndef	__thumb2__
679	ldr	r10,[sp,#136]		@ A[3][2]
680#endif
681	eor	r3,r3,r14
682#ifndef	__thumb2__
683	ldr	r11,[sp,#136+4]
684#else
685	ldrd	r10,r11,[sp,#136]		@ A[3][2]
686#endif
687	@ mov	r2,r2,ror#32-18		@ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
688#ifndef	__thumb2__
689	ldr	r12,[sp,#224]		@ D[3]
690#endif
691	@ mov	r3,r3,ror#32-18
692#ifndef	__thumb2__
693	ldr	r14,[sp,#224+4]
694#else
695	ldrd	r12,r14,[sp,#224]		@ D[3]
696#endif
697
698	eor	r6,r6,r4
699	eor	r7,r7,r5
700	mov	r4,r6,ror#32-5		@ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
701	mov	r5,r7,ror#32-5
702
703	eor	r10,r10,r8
704#ifndef	__thumb2__
705	ldr	r8,[sp,#184]		@ A[4][3]
706#endif
707	eor	r11,r11,r9
708#ifndef	__thumb2__
709	ldr	r9,[sp,#184+4]
710#else
711	ldrd	r8,r9,[sp,#184]		@ A[4][3]
712#endif
713	mov	r7,r10,ror#32-7		@ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
714	mov	r6,r11,ror#32-8
715
716	eor	r12,r12,r8
717	eor	r14,r14,r9
718	mov	r8,r12,ror#32-28		@ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
719	mov	r9,r14,ror#32-28
720
721	bic	r10,r4,r2,ror#32-18
722	bic	r11,r5,r3,ror#32-18
723	eor	r10,r10,r0,ror#32-14
724	eor	r11,r11,r1,ror#32-13
725#ifndef	__thumb2__
726	str	r10,[sp,#360]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
727#endif
728	bic	r12,r6,r4
729#ifndef	__thumb2__
730	str	r11,[sp,#360+4]
731#else
732	strd	r10,r11,[sp,#360]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
733#endif
734	bic	r14,r7,r5
735	eor	r12,r12,r2,ror#32-18
736#ifndef	__thumb2__
737	str	r12,[sp,#368]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
738#endif
739	eor	r14,r14,r3,ror#32-18
740#ifndef	__thumb2__
741	str	r14,[sp,#368+4]
742#else
743	strd	r12,r14,[sp,#368]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
744#endif
745	bic	r10,r8,r6
746	bic	r11,r9,r7
747	bic	r12,r0,r8,ror#14
748	bic	r14,r1,r9,ror#13
749	eor	r10,r10,r4
750	eor	r11,r11,r5
751#ifndef	__thumb2__
752	str	r10,[sp,#376]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
753#endif
754	bic	r2,r2,r0,ror#18-14
755#ifndef	__thumb2__
756	str	r11,[sp,#376+4]
757#else
758	strd	r10,r11,[sp,#376]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
759#endif
760	eor	r12,r6,r12,ror#32-14
761	bic	r11,r3,r1,ror#18-13
762	eor	r14,r7,r14,ror#32-13
763#ifndef	__thumb2__
764	str	r12,[sp,#384]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
765#endif
766#ifndef	__thumb2__
767	str	r14,[sp,#384+4]
768#else
769	strd	r12,r14,[sp,#384]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
770#endif
771	add	r14,sp,#216
772#ifndef	__thumb2__
773	ldr	r0,[sp,#16]		@ A[0][2]
774#endif
775	eor	r10,r8,r2,ror#32-18
776#ifndef	__thumb2__
777	ldr	r1,[sp,#16+4]
778#else
779	ldrd	r0,r1,[sp,#16]		@ A[0][2]
780#endif
781	eor	r11,r9,r11,ror#32-18
782#ifndef	__thumb2__
783	str	r10,[sp,#392]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
784#endif
785#ifndef	__thumb2__
786	str	r11,[sp,#392+4]
787#else
788	strd	r10,r11,[sp,#392]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
789#endif
790
791	ldmia	r14,{r10,r11,r12,r14}	@ D[2..3]
792#ifndef	__thumb2__
793	ldr	r2,[sp,#64]		@ A[1][3]
794#endif
795#ifndef	__thumb2__
796	ldr	r3,[sp,#64+4]
797#else
798	ldrd	r2,r3,[sp,#64]		@ A[1][3]
799#endif
800#ifndef	__thumb2__
801	ldr	r6,[sp,#232]		@ D[4]
802#endif
803#ifndef	__thumb2__
804	ldr	r7,[sp,#232+4]
805#else
806	ldrd	r6,r7,[sp,#232]		@ D[4]
807#endif
808
809	eor	r0,r0,r10
810#ifndef	__thumb2__
811	ldr	r4,[sp,#112]		@ A[2][4]
812#endif
813	eor	r1,r1,r11
814#ifndef	__thumb2__
815	ldr	r5,[sp,#112+4]
816#else
817	ldrd	r4,r5,[sp,#112]		@ A[2][4]
818#endif
819	@ mov	r0,r0,ror#32-31		@ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
820#ifndef	__thumb2__
821	ldr	r8,[sp,#200]		@ D[0]
822#endif
823	@ mov	r1,r1,ror#32-31
824#ifndef	__thumb2__
825	ldr	r9,[sp,#200+4]
826#else
827	ldrd	r8,r9,[sp,#200]		@ D[0]
828#endif
829
830	eor	r12,r12,r2
831#ifndef	__thumb2__
832	ldr	r10,[sp,#120]		@ A[3][0]
833#endif
834	eor	r14,r14,r3
835#ifndef	__thumb2__
836	ldr	r11,[sp,#120+4]
837#else
838	ldrd	r10,r11,[sp,#120]		@ A[3][0]
839#endif
840	mov	r3,r12,ror#32-27		@ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
841#ifndef	__thumb2__
842	ldr	r12,[sp,#208]		@ D[1]
843#endif
844	mov	r2,r14,ror#32-28
845#ifndef	__thumb2__
846	ldr	r14,[sp,#208+4]
847#else
848	ldrd	r12,r14,[sp,#208]		@ D[1]
849#endif
850
851	eor	r6,r6,r4
852	eor	r7,r7,r5
853	mov	r5,r6,ror#32-19		@ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
854	mov	r4,r7,ror#32-20
855
856	eor	r10,r10,r8
857#ifndef	__thumb2__
858	ldr	r8,[sp,#168]		@ A[4][1]
859#endif
860	eor	r11,r11,r9
861#ifndef	__thumb2__
862	ldr	r9,[sp,#168+4]
863#else
864	ldrd	r8,r9,[sp,#168]		@ A[4][1]
865#endif
866	mov	r7,r10,ror#32-20		@ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
867	mov	r6,r11,ror#32-21
868
869	eor	r8,r8,r12
870	eor	r9,r9,r14
871	@ mov	r8,r2,ror#32-1		@ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
872	@ mov	r9,r3,ror#32-1
873
874	bic	r10,r4,r2
875	bic	r11,r5,r3
876	eor	r10,r10,r0,ror#32-31
877#ifndef	__thumb2__
878	str	r10,[sp,#400]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
879#endif
880	eor	r11,r11,r1,ror#32-31
881#ifndef	__thumb2__
882	str	r11,[sp,#400+4]
883#else
884	strd	r10,r11,[sp,#400]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
885#endif
886	bic	r12,r6,r4
887	bic	r14,r7,r5
888	eor	r12,r12,r2
889	eor	r14,r14,r3
890#ifndef	__thumb2__
891	str	r12,[sp,#408]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
892#endif
893	bic	r10,r8,r6,ror#1
894#ifndef	__thumb2__
895	str	r14,[sp,#408+4]
896#else
897	strd	r12,r14,[sp,#408]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
898#endif
899	bic	r11,r9,r7,ror#1
900	bic	r12,r0,r8,ror#31-1
901	bic	r14,r1,r9,ror#31-1
902	eor	r4,r4,r10,ror#32-1
903#ifndef	__thumb2__
904	str	r4,[sp,#416]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
905#endif
906	eor	r5,r5,r11,ror#32-1
907#ifndef	__thumb2__
908	str	r5,[sp,#416+4]
909#else
910	strd	r4,r5,[sp,#416]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
911#endif
912	eor	r6,r6,r12,ror#32-31
913	eor	r7,r7,r14,ror#32-31
914#ifndef	__thumb2__
915	str	r6,[sp,#424]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
916#endif
917	bic	r10,r2,r0,ror#32-31
918#ifndef	__thumb2__
919	str	r7,[sp,#424+4]
920#else
921	strd	r6,r7,[sp,#424]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
922#endif
923	bic	r11,r3,r1,ror#32-31
924	add	r12,sp,#240
925	eor	r8,r10,r8,ror#32-1
926	add	r10,sp,#280
927	eor	r9,r11,r9,ror#32-1
928#ifndef	__thumb2__
929	str	r8,[sp,#432]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
930#endif
931#ifndef	__thumb2__
932	str	r9,[sp,#432+4]
933#else
934	strd	r8,r9,[sp,#432]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
935#endif
936	ldmia	r12,{r0,r1,r2,r3}		@ A[0][0..1]
937	ldmia	r10,{r10,r11,r12,r14}	@ A[1][0..1]
938#ifdef	__thumb2__
939	eor	r0,r0,r10
940	eor	r1,r1,r11
941	eor	r2,r2,r12
942	ldrd	r10,r11,[sp,#296]
943	eor	r3,r3,r14
944	ldrd	r12,r14,[sp,#304]
945	eor	r4,r4,r10
946	eor	r5,r5,r11
947	eor	r6,r6,r12
948	ldrd	r10,r11,[sp,#312]
949	eor	r7,r7,r14
950	ldrd	r12,r14,[sp,#320]
951	eor	r8,r8,r10
952	eor	r9,r9,r11
953	eor	r0,r0,r12
954	ldrd	r10,r11,[sp,#328]
955	eor	r1,r1,r14
956	ldrd	r12,r14,[sp,#336]
957	eor	r2,r2,r10
958	eor	r3,r3,r11
959	eor	r4,r4,r12
960	ldrd	r10,r11,[sp,#344]
961	eor	r5,r5,r14
962	ldrd	r12,r14,[sp,#352]
963	eor	r6,r6,r10
964	eor	r7,r7,r11
965	eor	r8,r8,r12
966	ldrd	r10,r11,[sp,#360]
967	eor	r9,r9,r14
968	ldrd	r12,r14,[sp,#368]
969	eor	r0,r0,r10
970	eor	r1,r1,r11
971	eor	r2,r2,r12
972	ldrd	r10,r11,[sp,#376]
973	eor	r3,r3,r14
974	ldrd	r12,r14,[sp,#384]
975	eor	r4,r4,r10
976	eor	r5,r5,r11
977	eor	r6,r6,r12
978	ldrd	r10,r11,[sp,#392]
979	eor	r7,r7,r14
980	ldrd	r12,r14,[sp,#400]
981	eor	r8,r8,r10
982	eor	r9,r9,r11
983	eor	r0,r0,r12
984	ldrd	r10,r11,[sp,#408]
985	eor	r1,r1,r14
986	ldrd	r12,r14,[sp,#256]
987	eor	r2,r2,r10
988	eor	r3,r3,r11
989	eor	r4,r4,r12
990	ldrd	r10,r11,[sp,#264]
991	eor	r5,r5,r14
992	ldrd	r12,r14,[sp,#272]
993#else
994	eor	r0,r0,r10
995	add	r10,sp,#296
996	eor	r1,r1,r11
997	eor	r2,r2,r12
998	eor	r3,r3,r14
999	ldmia	r10,{r10,r11,r12,r14}	@ A[1][2..3]
1000	eor	r4,r4,r10
1001	add	r10,sp,#312
1002	eor	r5,r5,r11
1003	eor	r6,r6,r12
1004	eor	r7,r7,r14
1005	ldmia	r10,{r10,r11,r12,r14}	@ A[1][4]..A[2][0]
1006	eor	r8,r8,r10
1007	add	r10,sp,#328
1008	eor	r9,r9,r11
1009	eor	r0,r0,r12
1010	eor	r1,r1,r14
1011	ldmia	r10,{r10,r11,r12,r14}	@ A[2][1..2]
1012	eor	r2,r2,r10
1013	add	r10,sp,#344
1014	eor	r3,r3,r11
1015	eor	r4,r4,r12
1016	eor	r5,r5,r14
1017	ldmia	r10,{r10,r11,r12,r14}	@ A[2][3..4]
1018	eor	r6,r6,r10
1019	add	r10,sp,#360
1020	eor	r7,r7,r11
1021	eor	r8,r8,r12
1022	eor	r9,r9,r14
1023	ldmia	r10,{r10,r11,r12,r14}	@ A[3][0..1]
1024	eor	r0,r0,r10
1025	add	r10,sp,#376
1026	eor	r1,r1,r11
1027	eor	r2,r2,r12
1028	eor	r3,r3,r14
1029	ldmia	r10,{r10,r11,r12,r14}	@ A[3][2..3]
1030	eor	r4,r4,r10
1031	add	r10,sp,#392
1032	eor	r5,r5,r11
1033	eor	r6,r6,r12
1034	eor	r7,r7,r14
1035	ldmia	r10,{r10,r11,r12,r14}	@ A[3][4]..A[4][0]
1036	eor	r8,r8,r10
1037	ldr	r10,[sp,#408]		@ A[4][1]
1038	eor	r9,r9,r11
1039	ldr	r11,[sp,#408+4]
1040	eor	r0,r0,r12
1041	ldr	r12,[sp,#256]		@ A[0][2]
1042	eor	r1,r1,r14
1043	ldr	r14,[sp,#256+4]
1044	eor	r2,r2,r10
1045	add	r10,sp,#264
1046	eor	r3,r3,r11
1047	eor	r4,r4,r12
1048	eor	r5,r5,r14
1049	ldmia	r10,{r10,r11,r12,r14}	@ A[0][3..4]
1050#endif
1051	eor	r6,r6,r10
1052	eor	r7,r7,r11
1053	eor	r8,r8,r12
1054	eor	r9,r9,r14
1055
1056	eor	r10,r0,r5,ror#32-1	@ E[0] = ROL64(C[2], 1) ^ C[0];
1057#ifndef	__thumb2__
1058	str	r10,[sp,#208]		@ D[1] = E[0]
1059#endif
1060	eor	r11,r1,r4
1061#ifndef	__thumb2__
1062	str	r11,[sp,#208+4]
1063#else
1064	strd	r10,r11,[sp,#208]		@ D[1] = E[0]
1065#endif
1066	eor	r12,r6,r1,ror#32-1	@ E[1] = ROL64(C[0], 1) ^ C[3];
1067	eor	r14,r7,r0
1068#ifndef	__thumb2__
1069	str	r12,[sp,#232]		@ D[4] = E[1]
1070#endif
1071	eor	r0,r8,r3,ror#32-1	@ C[0] = ROL64(C[1], 1) ^ C[4];
1072#ifndef	__thumb2__
1073	str	r14,[sp,#232+4]
1074#else
1075	strd	r12,r14,[sp,#232]		@ D[4] = E[1]
1076#endif
1077	eor	r1,r9,r2
1078#ifndef	__thumb2__
1079	str	r0,[sp,#200]		@ D[0] = C[0]
1080#endif
1081	eor	r2,r2,r7,ror#32-1	@ C[1] = ROL64(C[3], 1) ^ C[1];
1082#ifndef	__thumb2__
1083	ldr	r7,[sp,#384]
1084#endif
1085	eor	r3,r3,r6
1086#ifndef	__thumb2__
1087	str	r1,[sp,#200+4]
1088#else
1089	strd	r0,r1,[sp,#200]		@ D[0] = C[0]
1090#endif
1091#ifndef	__thumb2__
1092	ldr	r6,[sp,#384+4]
1093#else
1094	ldrd	r7,r6,[sp,#384]
1095#endif
1096#ifndef	__thumb2__
1097	str	r2,[sp,#216]		@ D[2] = C[1]
1098#endif
1099	eor	r4,r4,r9,ror#32-1	@ C[2] = ROL64(C[4], 1) ^ C[2];
1100#ifndef	__thumb2__
1101	str	r3,[sp,#216+4]
1102#else
1103	strd	r2,r3,[sp,#216]		@ D[2] = C[1]
1104#endif
1105	eor	r5,r5,r8
1106
1107#ifndef	__thumb2__
1108	ldr	r8,[sp,#432]
1109#endif
1110#ifndef	__thumb2__
1111	ldr	r9,[sp,#432+4]
1112#else
1113	ldrd	r8,r9,[sp,#432]
1114#endif
1115#ifndef	__thumb2__
1116	str	r4,[sp,#224]		@ D[3] = C[2]
1117#endif
1118	eor	r7,r7,r4
1119#ifndef	__thumb2__
1120	str	r5,[sp,#224+4]
1121#else
1122	strd	r4,r5,[sp,#224]		@ D[3] = C[2]
1123#endif
1124	eor	r6,r6,r5
1125#ifndef	__thumb2__
1126	ldr	r4,[sp,#240]
1127#endif
1128	@ mov	r7,r7,ror#32-10		@ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
1129	@ mov	r6,r6,ror#32-11
1130#ifndef	__thumb2__
1131	ldr	r5,[sp,#240+4]
1132#else
1133	ldrd	r4,r5,[sp,#240]
1134#endif
1135	eor	r8,r8,r12
1136	eor	r9,r9,r14
1137#ifndef	__thumb2__
1138	ldr	r12,[sp,#336]
1139#endif
1140	eor	r0,r0,r4
1141#ifndef	__thumb2__
1142	ldr	r14,[sp,#336+4]
1143#else
1144	ldrd	r12,r14,[sp,#336]
1145#endif
1146	@ mov	r8,r8,ror#32-7		@ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
1147	@ mov	r9,r9,ror#32-7
1148	eor	r1,r1,r5		@ C[0] =       A[0][0] ^ C[0];
1149	eor	r12,r12,r2
1150#ifndef	__thumb2__
1151	ldr	r2,[sp,#288]
1152#endif
1153	eor	r14,r14,r3
1154#ifndef	__thumb2__
1155	ldr	r3,[sp,#288+4]
1156#else
1157	ldrd	r2,r3,[sp,#288]
1158#endif
1159	mov	r5,r12,ror#32-21		@ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);
1160	ldr	r12,[sp,#444]			@ load counter
1161	eor	r2,r2,r10
1162	adr	r10,iotas32
1163	mov	r4,r14,ror#32-22
1164	add	r14,r10,r12
1165	eor	r3,r3,r11
1166#ifndef	__thumb2__
1167	ldr	r10,[r14,#8]		@ iotas[i].lo
1168#endif
1169	add	r12,r12,#16
1170#ifndef	__thumb2__
1171	ldr	r11,[r14,#12]		@ iotas[i].hi
1172#else
1173	ldrd	r10,r11,[r14,#8]		@ iotas[i].lo
1174#endif
1175	cmp	r12,#192
1176	str	r12,[sp,#444]			@ store counter
1177	bic	r12,r4,r2,ror#32-22
1178	bic	r14,r5,r3,ror#32-22
1179	mov	r2,r2,ror#32-22		@ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]);
1180	mov	r3,r3,ror#32-22
1181	eor	r12,r12,r0
1182	eor	r14,r14,r1
1183	eor	r10,r10,r12
1184	eor	r11,r11,r14
1185#ifndef	__thumb2__
1186	str	r10,[sp,#0]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
1187#endif
1188	bic	r12,r6,r4,ror#11
1189#ifndef	__thumb2__
1190	str	r11,[sp,#0+4]
1191#else
1192	strd	r10,r11,[sp,#0]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
1193#endif
1194	bic	r14,r7,r5,ror#10
1195	bic	r10,r8,r6,ror#32-(11-7)
1196	bic	r11,r9,r7,ror#32-(10-7)
1197	eor	r12,r2,r12,ror#32-11
1198#ifndef	__thumb2__
1199	str	r12,[sp,#8]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
1200#endif
1201	eor	r14,r3,r14,ror#32-10
1202#ifndef	__thumb2__
1203	str	r14,[sp,#8+4]
1204#else
1205	strd	r12,r14,[sp,#8]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
1206#endif
1207	eor	r10,r4,r10,ror#32-7
1208	eor	r11,r5,r11,ror#32-7
1209#ifndef	__thumb2__
1210	str	r10,[sp,#16]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
1211#endif
1212	bic	r12,r0,r8,ror#32-7
1213#ifndef	__thumb2__
1214	str	r11,[sp,#16+4]
1215#else
1216	strd	r10,r11,[sp,#16]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
1217#endif
1218	bic	r14,r1,r9,ror#32-7
1219	eor	r12,r12,r6,ror#32-11
1220#ifndef	__thumb2__
1221	str	r12,[sp,#24]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
1222#endif
1223	eor	r14,r14,r7,ror#32-10
1224#ifndef	__thumb2__
1225	str	r14,[sp,#24+4]
1226#else
1227	strd	r12,r14,[sp,#24]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
1228#endif
1229	bic	r10,r2,r0
1230	add	r14,sp,#224
1231#ifndef	__thumb2__
1232	ldr	r0,[sp,#264]		@ A[0][3]
1233#endif
1234	bic	r11,r3,r1
1235#ifndef	__thumb2__
1236	ldr	r1,[sp,#264+4]
1237#else
1238	ldrd	r0,r1,[sp,#264]		@ A[0][3]
1239#endif
1240	eor	r10,r10,r8,ror#32-7
1241	eor	r11,r11,r9,ror#32-7
1242#ifndef	__thumb2__
1243	str	r10,[sp,#32]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
1244#endif
1245	add	r9,sp,#200
1246#ifndef	__thumb2__
1247	str	r11,[sp,#32+4]
1248#else
1249	strd	r10,r11,[sp,#32]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
1250#endif
1251
1252	ldmia	r14,{r10,r11,r12,r14}	@ D[3..4]
1253	ldmia	r9,{r6,r7,r8,r9}		@ D[0..1]
1254
1255#ifndef	__thumb2__
1256	ldr	r2,[sp,#312]		@ A[1][4]
1257#endif
1258	eor	r0,r0,r10
1259#ifndef	__thumb2__
1260	ldr	r3,[sp,#312+4]
1261#else
1262	ldrd	r2,r3,[sp,#312]		@ A[1][4]
1263#endif
1264	eor	r1,r1,r11
1265	@ mov	r0,r0,ror#32-14		@ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
1266#ifndef	__thumb2__
1267	ldr	r10,[sp,#368]		@ A[3][1]
1268#endif
1269	@ mov	r1,r1,ror#32-14
1270#ifndef	__thumb2__
1271	ldr	r11,[sp,#368+4]
1272#else
1273	ldrd	r10,r11,[sp,#368]		@ A[3][1]
1274#endif
1275
1276	eor	r2,r2,r12
1277#ifndef	__thumb2__
1278	ldr	r4,[sp,#320]		@ A[2][0]
1279#endif
1280	eor	r3,r3,r14
1281#ifndef	__thumb2__
1282	ldr	r5,[sp,#320+4]
1283#else
1284	ldrd	r4,r5,[sp,#320]		@ A[2][0]
1285#endif
1286	@ mov	r2,r2,ror#32-10		@ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
1287	@ mov	r3,r3,ror#32-10
1288
1289	eor	r6,r6,r4
1290#ifndef	__thumb2__
1291	ldr	r12,[sp,#216]		@ D[2]
1292#endif
1293	eor	r7,r7,r5
1294#ifndef	__thumb2__
1295	ldr	r14,[sp,#216+4]
1296#else
1297	ldrd	r12,r14,[sp,#216]		@ D[2]
1298#endif
1299	mov	r5,r6,ror#32-1		@ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
1300	mov	r4,r7,ror#32-2
1301
1302	eor	r10,r10,r8
1303#ifndef	__thumb2__
1304	ldr	r8,[sp,#416]		@ A[4][2]
1305#endif
1306	eor	r11,r11,r9
1307#ifndef	__thumb2__
1308	ldr	r9,[sp,#416+4]
1309#else
1310	ldrd	r8,r9,[sp,#416]		@ A[4][2]
1311#endif
1312	mov	r7,r10,ror#32-22		@ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
1313	mov	r6,r11,ror#32-23
1314
1315	bic	r10,r4,r2,ror#32-10
1316	bic	r11,r5,r3,ror#32-10
1317	eor	r12,r12,r8
1318	eor	r14,r14,r9
1319	mov	r9,r12,ror#32-30		@ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
1320	mov	r8,r14,ror#32-31
1321	eor	r10,r10,r0,ror#32-14
1322	eor	r11,r11,r1,ror#32-14
1323#ifndef	__thumb2__
1324	str	r10,[sp,#40]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
1325#endif
1326	bic	r12,r6,r4
1327#ifndef	__thumb2__
1328	str	r11,[sp,#40+4]
1329#else
1330	strd	r10,r11,[sp,#40]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
1331#endif
1332	bic	r14,r7,r5
1333	eor	r12,r12,r2,ror#32-10
1334#ifndef	__thumb2__
1335	str	r12,[sp,#48]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
1336#endif
1337	eor	r14,r14,r3,ror#32-10
1338#ifndef	__thumb2__
1339	str	r14,[sp,#48+4]
1340#else
1341	strd	r12,r14,[sp,#48]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
1342#endif
1343	bic	r10,r8,r6
1344	bic	r11,r9,r7
1345	bic	r12,r0,r8,ror#14
1346	bic	r14,r1,r9,ror#14
1347	eor	r10,r10,r4
1348	eor	r11,r11,r5
1349#ifndef	__thumb2__
1350	str	r10,[sp,#56]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
1351#endif
1352	bic	r2,r2,r0,ror#32-(14-10)
1353#ifndef	__thumb2__
1354	str	r11,[sp,#56+4]
1355#else
1356	strd	r10,r11,[sp,#56]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
1357#endif
1358	eor	r12,r6,r12,ror#32-14
1359	bic	r11,r3,r1,ror#32-(14-10)
1360#ifndef	__thumb2__
1361	str	r12,[sp,#64]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
1362#endif
1363	eor	r14,r7,r14,ror#32-14
1364#ifndef	__thumb2__
1365	str	r14,[sp,#64+4]
1366#else
1367	strd	r12,r14,[sp,#64]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
1368#endif
1369	add	r12,sp,#208
1370#ifndef	__thumb2__
1371	ldr	r1,[sp,#248]		@ A[0][1]
1372#endif
1373	eor	r10,r8,r2,ror#32-10
1374#ifndef	__thumb2__
1375	ldr	r0,[sp,#248+4]
1376#else
1377	ldrd	r1,r0,[sp,#248]		@ A[0][1]
1378#endif
1379	eor	r11,r9,r11,ror#32-10
1380#ifndef	__thumb2__
1381	str	r10,[sp,#72]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
1382#endif
1383#ifndef	__thumb2__
1384	str	r11,[sp,#72+4]
1385#else
1386	strd	r10,r11,[sp,#72]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
1387#endif
1388
1389	add	r9,sp,#224
1390	ldmia	r12,{r10,r11,r12,r14}	@ D[1..2]
1391#ifndef	__thumb2__
1392	ldr	r2,[sp,#296]		@ A[1][2]
1393#endif
1394#ifndef	__thumb2__
1395	ldr	r3,[sp,#296+4]
1396#else
1397	ldrd	r2,r3,[sp,#296]		@ A[1][2]
1398#endif
1399	ldmia	r9,{r6,r7,r8,r9}		@ D[3..4]
1400
1401	eor	r1,r1,r10
1402#ifndef	__thumb2__
1403	ldr	r4,[sp,#344]		@ A[2][3]
1404#endif
1405	eor	r0,r0,r11
1406#ifndef	__thumb2__
1407	ldr	r5,[sp,#344+4]
1408#else
1409	ldrd	r4,r5,[sp,#344]		@ A[2][3]
1410#endif
1411	mov	r0,r0,ror#32-1		@ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
1412
1413	eor	r2,r2,r12
1414#ifndef	__thumb2__
1415	ldr	r10,[sp,#392]		@ A[3][4]
1416#endif
1417	eor	r3,r3,r14
1418#ifndef	__thumb2__
1419	ldr	r11,[sp,#392+4]
1420#else
1421	ldrd	r10,r11,[sp,#392]		@ A[3][4]
1422#endif
1423	@ mov	r2,r2,ror#32-3		@ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
1424#ifndef	__thumb2__
1425	ldr	r12,[sp,#200]		@ D[0]
1426#endif
1427	@ mov	r3,r3,ror#32-3
1428#ifndef	__thumb2__
1429	ldr	r14,[sp,#200+4]
1430#else
1431	ldrd	r12,r14,[sp,#200]		@ D[0]
1432#endif
1433
1434	eor	r4,r4,r6
1435	eor	r5,r5,r7
1436	@ mov	r5,r6,ror#32-12		@ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
1437	@ mov	r4,r7,ror#32-13		@ [track reverse order below]
1438
1439	eor	r10,r10,r8
1440#ifndef	__thumb2__
1441	ldr	r8,[sp,#400]		@ A[4][0]
1442#endif
1443	eor	r11,r11,r9
1444#ifndef	__thumb2__
1445	ldr	r9,[sp,#400+4]
1446#else
1447	ldrd	r8,r9,[sp,#400]		@ A[4][0]
1448#endif
1449	mov	r6,r10,ror#32-4		@ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
1450	mov	r7,r11,ror#32-4
1451
1452	eor	r12,r12,r8
1453	eor	r14,r14,r9
1454	mov	r8,r12,ror#32-9		@ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
1455	mov	r9,r14,ror#32-9
1456
1457	bic	r10,r5,r2,ror#13-3
1458	bic	r11,r4,r3,ror#12-3
1459	bic	r12,r6,r5,ror#32-13
1460	bic	r14,r7,r4,ror#32-12
1461	eor	r10,r0,r10,ror#32-13
1462	eor	r11,r1,r11,ror#32-12
1463#ifndef	__thumb2__
1464	str	r10,[sp,#80]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
1465#endif
1466	eor	r12,r12,r2,ror#32-3
1467#ifndef	__thumb2__
1468	str	r11,[sp,#80+4]
1469#else
1470	strd	r10,r11,[sp,#80]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
1471#endif
1472	eor	r14,r14,r3,ror#32-3
1473#ifndef	__thumb2__
1474	str	r12,[sp,#88]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
1475#endif
1476	bic	r10,r8,r6
1477	bic	r11,r9,r7
1478#ifndef	__thumb2__
1479	str	r14,[sp,#88+4]
1480#else
1481	strd	r12,r14,[sp,#88]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
1482#endif
1483	eor	r10,r10,r5,ror#32-13
1484	eor	r11,r11,r4,ror#32-12
1485#ifndef	__thumb2__
1486	str	r10,[sp,#96]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
1487#endif
1488	bic	r12,r0,r8
1489#ifndef	__thumb2__
1490	str	r11,[sp,#96+4]
1491#else
1492	strd	r10,r11,[sp,#96]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
1493#endif
1494	bic	r14,r1,r9
1495	eor	r12,r12,r6
1496	eor	r14,r14,r7
1497#ifndef	__thumb2__
1498	str	r12,[sp,#104]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
1499#endif
1500	bic	r10,r2,r0,ror#3
1501#ifndef	__thumb2__
1502	str	r14,[sp,#104+4]
1503#else
1504	strd	r12,r14,[sp,#104]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
1505#endif
1506	bic	r11,r3,r1,ror#3
1507#ifndef	__thumb2__
1508	ldr	r1,[sp,#272]		@ A[0][4] [in reverse order]
1509#endif
1510	eor	r10,r8,r10,ror#32-3
1511#ifndef	__thumb2__
1512	ldr	r0,[sp,#272+4]
1513#else
1514	ldrd	r1,r0,[sp,#272]		@ A[0][4] [in reverse order]
1515#endif
1516	eor	r11,r9,r11,ror#32-3
1517#ifndef	__thumb2__
1518	str	r10,[sp,#112]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
1519#endif
1520	add	r9,sp,#208
1521#ifndef	__thumb2__
1522	str	r11,[sp,#112+4]
1523#else
1524	strd	r10,r11,[sp,#112]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
1525#endif
1526
1527#ifndef	__thumb2__
1528	ldr	r10,[sp,#232]		@ D[4]
1529#endif
1530#ifndef	__thumb2__
1531	ldr	r11,[sp,#232+4]
1532#else
1533	ldrd	r10,r11,[sp,#232]		@ D[4]
1534#endif
1535#ifndef	__thumb2__
1536	ldr	r12,[sp,#200]		@ D[0]
1537#endif
1538#ifndef	__thumb2__
1539	ldr	r14,[sp,#200+4]
1540#else
1541	ldrd	r12,r14,[sp,#200]		@ D[0]
1542#endif
1543
1544	ldmia	r9,{r6,r7,r8,r9}		@ D[1..2]
1545
1546	eor	r1,r1,r10
1547#ifndef	__thumb2__
1548	ldr	r2,[sp,#280]		@ A[1][0]
1549#endif
1550	eor	r0,r0,r11
1551#ifndef	__thumb2__
1552	ldr	r3,[sp,#280+4]
1553#else
1554	ldrd	r2,r3,[sp,#280]		@ A[1][0]
1555#endif
1556	@ mov	r1,r10,ror#32-13		@ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
1557#ifndef	__thumb2__
1558	ldr	r4,[sp,#328]		@ A[2][1]
1559#endif
1560	@ mov	r0,r11,ror#32-14		@ [was loaded in reverse order]
1561#ifndef	__thumb2__
1562	ldr	r5,[sp,#328+4]
1563#else
1564	ldrd	r4,r5,[sp,#328]		@ A[2][1]
1565#endif
1566
1567	eor	r2,r2,r12
1568#ifndef	__thumb2__
1569	ldr	r10,[sp,#376]		@ A[3][2]
1570#endif
1571	eor	r3,r3,r14
1572#ifndef	__thumb2__
1573	ldr	r11,[sp,#376+4]
1574#else
1575	ldrd	r10,r11,[sp,#376]		@ A[3][2]
1576#endif
1577	@ mov	r2,r2,ror#32-18		@ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
1578#ifndef	__thumb2__
1579	ldr	r12,[sp,#224]		@ D[3]
1580#endif
1581	@ mov	r3,r3,ror#32-18
1582#ifndef	__thumb2__
1583	ldr	r14,[sp,#224+4]
1584#else
1585	ldrd	r12,r14,[sp,#224]		@ D[3]
1586#endif
1587
1588	eor	r6,r6,r4
1589	eor	r7,r7,r5
1590	mov	r4,r6,ror#32-5		@ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
1591	mov	r5,r7,ror#32-5
1592
1593	eor	r10,r10,r8
1594#ifndef	__thumb2__
1595	ldr	r8,[sp,#424]		@ A[4][3]
1596#endif
1597	eor	r11,r11,r9
1598#ifndef	__thumb2__
1599	ldr	r9,[sp,#424+4]
1600#else
1601	ldrd	r8,r9,[sp,#424]		@ A[4][3]
1602#endif
1603	mov	r7,r10,ror#32-7		@ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
1604	mov	r6,r11,ror#32-8
1605
1606	eor	r12,r12,r8
1607	eor	r14,r14,r9
1608	mov	r8,r12,ror#32-28		@ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
1609	mov	r9,r14,ror#32-28
1610
1611	bic	r10,r4,r2,ror#32-18
1612	bic	r11,r5,r3,ror#32-18
1613	eor	r10,r10,r0,ror#32-14
1614	eor	r11,r11,r1,ror#32-13
1615#ifndef	__thumb2__
1616	str	r10,[sp,#120]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
1617#endif
1618	bic	r12,r6,r4
1619#ifndef	__thumb2__
1620	str	r11,[sp,#120+4]
1621#else
1622	strd	r10,r11,[sp,#120]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
1623#endif
1624	bic	r14,r7,r5
1625	eor	r12,r12,r2,ror#32-18
1626#ifndef	__thumb2__
1627	str	r12,[sp,#128]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
1628#endif
1629	eor	r14,r14,r3,ror#32-18
1630#ifndef	__thumb2__
1631	str	r14,[sp,#128+4]
1632#else
1633	strd	r12,r14,[sp,#128]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
1634#endif
1635	bic	r10,r8,r6
1636	bic	r11,r9,r7
1637	bic	r12,r0,r8,ror#14
1638	bic	r14,r1,r9,ror#13
1639	eor	r10,r10,r4
1640	eor	r11,r11,r5
1641#ifndef	__thumb2__
1642	str	r10,[sp,#136]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
1643#endif
1644	bic	r2,r2,r0,ror#18-14
1645#ifndef	__thumb2__
1646	str	r11,[sp,#136+4]
1647#else
1648	strd	r10,r11,[sp,#136]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
1649#endif
1650	eor	r12,r6,r12,ror#32-14
1651	bic	r11,r3,r1,ror#18-13
1652	eor	r14,r7,r14,ror#32-13
1653#ifndef	__thumb2__
1654	str	r12,[sp,#144]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
1655#endif
1656#ifndef	__thumb2__
1657	str	r14,[sp,#144+4]
1658#else
1659	strd	r12,r14,[sp,#144]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
1660#endif
1661	add	r14,sp,#216
1662#ifndef	__thumb2__
1663	ldr	r0,[sp,#256]		@ A[0][2]
1664#endif
1665	eor	r10,r8,r2,ror#32-18
1666#ifndef	__thumb2__
1667	ldr	r1,[sp,#256+4]
1668#else
1669	ldrd	r0,r1,[sp,#256]		@ A[0][2]
1670#endif
1671	eor	r11,r9,r11,ror#32-18
1672#ifndef	__thumb2__
1673	str	r10,[sp,#152]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
1674#endif
1675#ifndef	__thumb2__
1676	str	r11,[sp,#152+4]
1677#else
1678	strd	r10,r11,[sp,#152]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
1679#endif
1680
1681	ldmia	r14,{r10,r11,r12,r14}	@ D[2..3]
1682#ifndef	__thumb2__
1683	ldr	r2,[sp,#304]		@ A[1][3]
1684#endif
1685#ifndef	__thumb2__
1686	ldr	r3,[sp,#304+4]
1687#else
1688	ldrd	r2,r3,[sp,#304]		@ A[1][3]
1689#endif
1690#ifndef	__thumb2__
1691	ldr	r6,[sp,#232]		@ D[4]
1692#endif
1693#ifndef	__thumb2__
1694	ldr	r7,[sp,#232+4]
1695#else
1696	ldrd	r6,r7,[sp,#232]		@ D[4]
1697#endif
1698
1699	eor	r0,r0,r10
1700#ifndef	__thumb2__
1701	ldr	r4,[sp,#352]		@ A[2][4]
1702#endif
1703	eor	r1,r1,r11
1704#ifndef	__thumb2__
1705	ldr	r5,[sp,#352+4]
1706#else
1707	ldrd	r4,r5,[sp,#352]		@ A[2][4]
1708#endif
1709	@ mov	r0,r0,ror#32-31		@ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
1710#ifndef	__thumb2__
1711	ldr	r8,[sp,#200]		@ D[0]
1712#endif
1713	@ mov	r1,r1,ror#32-31
1714#ifndef	__thumb2__
1715	ldr	r9,[sp,#200+4]
1716#else
1717	ldrd	r8,r9,[sp,#200]		@ D[0]
1718#endif
1719
1720	eor	r12,r12,r2
1721#ifndef	__thumb2__
1722	ldr	r10,[sp,#360]		@ A[3][0]
1723#endif
1724	eor	r14,r14,r3
1725#ifndef	__thumb2__
1726	ldr	r11,[sp,#360+4]
1727#else
1728	ldrd	r10,r11,[sp,#360]		@ A[3][0]
1729#endif
1730	mov	r3,r12,ror#32-27		@ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
1731#ifndef	__thumb2__
1732	ldr	r12,[sp,#208]		@ D[1]
1733#endif
1734	mov	r2,r14,ror#32-28
1735#ifndef	__thumb2__
1736	ldr	r14,[sp,#208+4]
1737#else
1738	ldrd	r12,r14,[sp,#208]		@ D[1]
1739#endif
1740
1741	eor	r6,r6,r4
1742	eor	r7,r7,r5
1743	mov	r5,r6,ror#32-19		@ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
1744	mov	r4,r7,ror#32-20
1745
1746	eor	r10,r10,r8
1747#ifndef	__thumb2__
1748	ldr	r8,[sp,#408]		@ A[4][1]
1749#endif
1750	eor	r11,r11,r9
1751#ifndef	__thumb2__
1752	ldr	r9,[sp,#408+4]
1753#else
1754	ldrd	r8,r9,[sp,#408]		@ A[4][1]
1755#endif
1756	mov	r7,r10,ror#32-20		@ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
1757	mov	r6,r11,ror#32-21
1758
1759	eor	r8,r8,r12
1760	eor	r9,r9,r14
1761	@ mov	r8,r2,ror#32-1		@ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
1762	@ mov	r9,r3,ror#32-1
1763
1764	bic	r10,r4,r2
1765	bic	r11,r5,r3
1766	eor	r10,r10,r0,ror#32-31
1767#ifndef	__thumb2__
1768	str	r10,[sp,#160]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
1769#endif
1770	eor	r11,r11,r1,ror#32-31
1771#ifndef	__thumb2__
1772	str	r11,[sp,#160+4]
1773#else
1774	strd	r10,r11,[sp,#160]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
1775#endif
1776	bic	r12,r6,r4
1777	bic	r14,r7,r5
1778	eor	r12,r12,r2
1779	eor	r14,r14,r3
1780#ifndef	__thumb2__
1781	str	r12,[sp,#168]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
1782#endif
1783	bic	r10,r8,r6,ror#1
1784#ifndef	__thumb2__
1785	str	r14,[sp,#168+4]
1786#else
1787	strd	r12,r14,[sp,#168]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
1788#endif
1789	bic	r11,r9,r7,ror#1
1790	bic	r12,r0,r8,ror#31-1
1791	bic	r14,r1,r9,ror#31-1
1792	eor	r4,r4,r10,ror#32-1
1793#ifndef	__thumb2__
1794	str	r4,[sp,#176]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
1795#endif
1796	eor	r5,r5,r11,ror#32-1
1797#ifndef	__thumb2__
1798	str	r5,[sp,#176+4]
1799#else
1800	strd	r4,r5,[sp,#176]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
1801#endif
1802	eor	r6,r6,r12,ror#32-31
1803	eor	r7,r7,r14,ror#32-31
1804#ifndef	__thumb2__
1805	str	r6,[sp,#184]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
1806#endif
1807	bic	r10,r2,r0,ror#32-31
1808#ifndef	__thumb2__
1809	str	r7,[sp,#184+4]
1810#else
1811	strd	r6,r7,[sp,#184]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
1812#endif
1813	bic	r11,r3,r1,ror#32-31
1814	add	r12,sp,#0
1815	eor	r8,r10,r8,ror#32-1
1816	add	r10,sp,#40
1817	eor	r9,r11,r9,ror#32-1
1818#ifndef	__thumb2__
1819	str	r8,[sp,#192]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
1820#endif
1821#ifndef	__thumb2__
1822	str	r9,[sp,#192+4]
1823#else
1824	strd	r8,r9,[sp,#192]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
1825#endif
1826	blo	.Lround2x
1827
1828#if __ARM_ARCH__>=5
1829	ldr	pc,[sp,#440]
1830#else
1831	ldr	lr,[sp,#440]
1832	tst	lr,#1
1833	moveq	pc,lr		@ be binary compatible with V4, yet
1834.word	0xe12fff1e		@ interoperable with Thumb ISA:-)
1835#endif
1836.size	KeccakF1600_int,.-KeccakF1600_int
1837
@ KeccakF1600 -- run KeccakF1600_enter on the state that r0 points at.
@ In:	r0 = pointer to the state; five groups of ten words (200 bytes)
@	are read from it and written back to it.
@ Frame after the prologue:
@	[sp,#0..199]	working copy of A[5][5]
@	[sp,#440]	slot KeccakF1600_enter stores its return address in
@	[sp,#456]	caller's r0 (state pointer), pushed by the stmdb below
@ r4-r11 are restored on exit; r0-r3, r12 and lr are overwritten.
1838.type	KeccakF1600, %function
1839.align	5
1840KeccakF1600:
1841	stmdb	sp!,{r0,r4-r11,lr}
1842	sub	sp,sp,#440+16			@ space for A[5][5],D[5],T[5][5],...
1843
@ Copy the state to [sp,#0..199] ten words at a time; r10/r11 are the
@ source/destination cursors for the groups after the first one.
1844	add	r10,r0,#40
1845	add	r11,sp,#40
1846	ldmia	r0,    {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}		@ copy A[5][5] to stack
1847	stmia	sp,    {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1848	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1849	stmia	r11!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1850	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1851	stmia	r11!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1852	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1853	stmia	r11!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1854	ldmia	r10, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
@ r12 = sp and r10 = sp+40 are the values KeccakF1600_int sets up before
@ it falls into KeccakF1600_enter.  r4-r9 still hold state bytes 176..199
@ from the load above, i.e. what KeccakF1600_int fetches from [sp,#176].
1855	add	r12,sp,#0
1856	add	r10,sp,#40
1857	stmia	r11, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1858
1859	bl	KeccakF1600_enter
1860
@ Copy the result back.  r10 is sp+40 here: the round loop re-computes it
@ just before its final conditional branch, so it doubles as the source
@ cursor for the groups after the first one.
1861	ldr	r11, [sp,#440+16]		@ restore pointer to A
1862	ldmia	sp,    {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1863	stmia	r11!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}		@ return A[5][5]
1864	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1865	stmia	r11!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1866	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1867	stmia	r11!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1868	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1869	stmia	r11!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1870	ldmia	r10, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1871	stmia	r11, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1872
@ Drop the 440+16-byte frame plus the 4-byte saved r0, leaving sp at the
@ saved r4.
1873	add	sp,sp,#440+20
1874#if __ARM_ARCH__>=5
1875	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
1876#else
1877	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
1878	tst	lr,#1
1879	moveq	pc,lr		@ be binary compatible with V4, yet
1880.word	0xe12fff1e		@ interoperable with Thumb ISA:-)
1881#endif
1882.size	KeccakF1600,.-KeccakF1600
@ SHA3_absorb -- XOR whole blocks of input into the state, running
@ KeccakF1600_int after each block.
@ In:	r0 = pointer to state A (200 bytes), r1 = input pointer,
@	r2 = input length in bytes, r3 = block size in bytes
@ Out:	r0 = number of input bytes left unprocessed (less than r3)
@ The block loop consumes 8 bytes per pass and stops once the counter is
@ no longer above zero, so r3 is presumably a non-zero multiple of 8 --
@ NOTE(review): confirm against callers.
@ Frame after the prologue:
@	[sp,#0..199]	working copy of A[5][5]
@	[sp,#456..468]	masks 0x55555555, 0x33333333, 0x0f0f0f0f, 0x00ff00ff
@	[sp,#472]	saved r0 (A)
@	[sp,#476]	saved r1, reused as the current input pointer
@	[sp,#480]	saved r2, reused as the remaining length
@	[sp,#484]	saved r3 (block size)
@	[sp,#488..]	saved r4-r12, lr
1883.globl	SHA3_absorb
1884.type	SHA3_absorb,%function
1885.align	5
1886SHA3_absorb:
1887	stmdb	sp!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
1888	sub	sp,sp,#456+16
1889
1890	add	r10,r0,#40
1891	@ mov	r11,r1
1892	mov	r12,r2
1893	mov	r14,r3
@ Less than one block of input: return len unchanged without touching A.
1894	cmp	r2,r3
1895	blo	.Labsorb_abort
1896
1897	add	r11,sp,#0
1898	ldmia	r0,      {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}	@ copy A[5][5] to stack
1899	stmia	r11!,   {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1900	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1901	stmia	r11!,   {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1902	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1903	stmia	r11!,   {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1904	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1905	stmia	r11!,   {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1906	ldmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1907	stmia	r11,    {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
1908
@ r1 was overwritten by the copy above; reload the input pointer from its
@ stack slot.
1909	ldr	r11,[sp,#476]		@ restore r11
1910#ifdef	__thumb2__
1911	mov	r9,#0x00ff00ff
1912	mov	r8,#0x0f0f0f0f
1913	mov	r7,#0x33333333
1914	mov	r6,#0x55555555
1915#else
1916	mov	r6,#0x11		@ compose constants
1917	mov	r8,#0x0f
1918	mov	r9,#0xff
1919	orr	r6,r6,r6,lsl#8
1920	orr	r8,r8,r8,lsl#8
1921	orr	r6,r6,r6,lsl#16		@ 0x11111111
1922	orr	r9,r9,r9,lsl#16		@ 0x00ff00ff
1923	orr	r8,r8,r8,lsl#16		@ 0x0f0f0f0f
1924	orr	r7,r6,r6,lsl#1		@ 0x33333333
1925	orr	r6,r6,r6,lsl#2		@ 0x55555555
1926#endif
@ Park the masks on the stack so they can be reloaded after the
@ KeccakF1600_int call.
1927	str	r9,[sp,#468]
1928	str	r8,[sp,#464]
1929	str	r7,[sp,#460]
1930	str	r6,[sp,#456]
1931	b	.Loop_absorb
1932
@ Per block: r12 = remaining length, r14 = block size, r11 = input pointer.
1933.align	4
1934.Loop_absorb:
1935	subs	r0,r12,r14
1936	blo	.Labsorbed
1937	add	r10,sp,#0
1938	str	r0,[sp,#480]		@ save len - bsz
1939
@ Per 8 input bytes: r10 = cursor into the stack copy of A, r14 = bytes of
@ the block still to absorb.
1940.align	4
1941.Loop_block:
@ Assemble two little-endian words byte by byte, so the input pointer
@ needs no alignment.
1942	ldrb	r0,[r11],#1
1943	ldrb	r1,[r11],#1
1944	ldrb	r2,[r11],#1
1945	ldrb	r3,[r11],#1
1946	ldrb	r4,[r11],#1
1947	orr	r0,r0,r1,lsl#8
1948	ldrb	r1,[r11],#1
1949	orr	r0,r0,r2,lsl#16
1950	ldrb	r2,[r11],#1
1951	orr	r0,r0,r3,lsl#24		@ lo
1952	ldrb	r3,[r11],#1
1953	orr	r1,r4,r1,lsl#8
1954	orr	r1,r1,r2,lsl#16
1955	orr	r1,r1,r3,lsl#24		@ hi
1956
@ Bit interleave.  r2/r3 take the even-numbered bits of lo/hi and pack
@ them towards the low half-word; r0/r1 take the odd-numbered bits and
@ pack them towards the high half-word.  Each mask/shift/or stage
@ doubles the size of the packed groups (1 -> 2 -> 4 -> 8 -> 16 bits).
1957	and	r2,r0,r6		@ &=0x55555555
1958	and	r0,r0,r6,lsl#1		@ &=0xaaaaaaaa
1959	and	r3,r1,r6		@ &=0x55555555
1960	and	r1,r1,r6,lsl#1		@ &=0xaaaaaaaa
1961	orr	r2,r2,r2,lsr#1
1962	orr	r0,r0,r0,lsl#1
1963	orr	r3,r3,r3,lsr#1
1964	orr	r1,r1,r1,lsl#1
1965	and	r2,r2,r7		@ &=0x33333333
1966	and	r0,r0,r7,lsl#2		@ &=0xcccccccc
1967	and	r3,r3,r7		@ &=0x33333333
1968	and	r1,r1,r7,lsl#2		@ &=0xcccccccc
1969	orr	r2,r2,r2,lsr#2
1970	orr	r0,r0,r0,lsl#2
1971	orr	r3,r3,r3,lsr#2
1972	orr	r1,r1,r1,lsl#2
1973	and	r2,r2,r8		@ &=0x0f0f0f0f
1974	and	r0,r0,r8,lsl#4		@ &=0xf0f0f0f0
1975	and	r3,r3,r8		@ &=0x0f0f0f0f
1976	and	r1,r1,r8,lsl#4		@ &=0xf0f0f0f0
1977	ldmia	r10,{r4,r5}		@ A_flat[i]
1978	orr	r2,r2,r2,lsr#4
1979	orr	r0,r0,r0,lsl#4
1980	orr	r3,r3,r3,lsr#4
1981	orr	r1,r1,r1,lsl#4
1982	and	r2,r2,r9		@ &=0x00ff00ff
1983	and	r0,r0,r9,lsl#8		@ &=0xff00ff00
1984	and	r3,r3,r9		@ &=0x00ff00ff
1985	and	r1,r1,r9,lsl#8		@ &=0xff00ff00
1986	orr	r2,r2,r2,lsr#8
1987	orr	r0,r0,r0,lsl#8
1988	orr	r3,r3,r3,lsr#8
1989	orr	r1,r1,r1,lsl#8
1990
@ The shifts by 16 discard the stale half of each register.  r4 (first
@ word) receives the even bits: lo's in its low half, hi's in its high
@ half.  r5 (second word) receives the odd bits the same way.
1991	mov	r2,r2,lsl#16
1992	mov	r1,r1,lsr#16
1993	eor	r4,r4,r3,lsl#16
1994	eor	r5,r5,r0,lsr#16
1995	eor	r4,r4,r2,lsr#16
1996	eor	r5,r5,r1,lsl#16
1997	stmia	r10!,{r4,r5}	@ A_flat[i++] ^= BitInterleave(inp[0..7])
1998
1999	subs	r14,r14,#8
2000	bhi	.Loop_block
2001
@ Save the advanced input pointer; KeccakF1600_int uses r11 itself.
2002	str	r11,[sp,#476]
2003
2004	bl	KeccakF1600_int
2005
@ One load brings back the four masks (r6-r9), A (r10), the input pointer
@ (r11), len - bsz (r12) and the block size (r14) from [sp,#456..487].
2006	add	r14,sp,#456
2007	ldmia	r14,{r6,r7,r8,r9,r10,r11,r12,r14}	@ restore constants and variables
2008	b	.Loop_absorb
2009
@ Reached only after at least one block (the entry check guarantees
@ len >= bsz), so r10 holds A as reloaded above.
2010.align	4
2011.Labsorbed:
2012	add	r11,sp,#40
2013	ldmia	sp,      {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
2014	stmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}	@ return A[5][5]
2015	ldmia	r11!,   {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
2016	stmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
2017	ldmia	r11!,   {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
2018	stmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
2019	ldmia	r11!,   {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
2020	stmia	r10!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
2021	ldmia	r11,    {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
2022	stmia	r10, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
2023
@ Skip the 456+16-byte frame and the four saved argument registers
@ (r0-r3), leaving sp at the saved r4.
2024.Labsorb_abort:
2025	add	sp,sp,#456+32
2026	mov	r0,r12			@ return value
2027#if __ARM_ARCH__>=5
2028	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
2029#else
2030	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
2031	tst	lr,#1
2032	moveq	pc,lr		@ be binary compatible with V4, yet
2033.word	0xe12fff1e		@ interoperable with Thumb ISA:-)
2034#endif
2035.size	SHA3_absorb,.-SHA3_absorb
@ SHA3_squeeze -- write output bytes taken from the state, calling
@ KeccakF1600 each time a whole block has been emitted and more output
@ is still wanted.
@ In:	r0 = pointer to state A, r1 = output pointer,
@	r2 = number of bytes to produce, r3 = block size in bytes
@ A request for zero bytes returns without writing to the output buffer.
@ Register roles in the loop:
@	r4 = output cursor, r5 = bytes still to produce,
@	r10 = cursor into A, r12 = bytes left in the current block,
@	r14 = A (kept for the KeccakF1600 call), r6-r9 = de-interleave masks
@ Stack after both pushes:
@	[sp,#0..12]	masks 0x55555555, 0x33333333, 0x0f0f0f0f, 0x00ff00ff
@	[sp,#16]	saved r0 (A)
@	[sp,#20]	saved r3 (block size)
@	[sp,#24..]	saved r4-r10, lr
2036.globl	SHA3_squeeze
2037.type	SHA3_squeeze,%function
2038.align	5
2039SHA3_squeeze:
2040	stmdb	sp!,{r0,r3-r10,lr}
2041
2042	mov	r10,r0
2043	mov	r4,r1
2044	mov	r5,r2
2045	mov	r12,r3
2046
2047#ifdef	__thumb2__
2048	mov	r9,#0x00ff00ff
2049	mov	r8,#0x0f0f0f0f
2050	mov	r7,#0x33333333
2051	mov	r6,#0x55555555
2052#else
2053	mov	r6,#0x11		@ compose constants
2054	mov	r8,#0x0f
2055	mov	r9,#0xff
2056	orr	r6,r6,r6,lsl#8
2057	orr	r8,r8,r8,lsl#8
2058	orr	r6,r6,r6,lsl#16		@ 0x11111111
2059	orr	r9,r9,r9,lsl#16		@ 0x00ff00ff
2060	orr	r8,r8,r8,lsl#16		@ 0x0f0f0f0f
2061	orr	r7,r6,r6,lsl#1		@ 0x33333333
2062	orr	r6,r6,r6,lsl#2		@ 0x55555555
2063#endif
2064	stmdb	sp!,{r6,r7,r8,r9}
2065
@ Zero-length request: leave now.  Without this check the loop would take
@ the .Lsqueeze_tail path, which stores a byte before testing the counter;
@ the counter would wrap and 7 bytes would be written to a buffer the
@ caller asked us not to touch.  The frame is complete at this point, so
@ the common epilogue can be used.
	cmp	r5,#0
	beq	.Lsqueeze_done

2066	mov	r14,r10
2067	b	.Loop_squeeze
2068
2069.align	4
2070.Loop_squeeze:
2071	ldmia	r10!,{r0,r1}	@ A_flat[i++]
2072
@ Undo the bit interleave: r2/r3 take the low half-words of the two state
@ words and become the first four output bytes; r0/r1 take the high
@ half-words and become the next four.
2073	mov	r2,r0,lsl#16
2074	mov	r3,r1,lsl#16		@ r3 = r1 << 16
2075	mov	r2,r2,lsr#16		@ r2 = r0 & 0x0000ffff
2076	mov	r1,r1,lsr#16
2077	mov	r0,r0,lsr#16		@ r0 = r0 >> 16
2078	mov	r1,r1,lsl#16		@ r1 = r1 & 0xffff0000
2079
@ Spread the packed bits apart, halving the group size at each stage
@ (16 -> 8 -> 4 -> 2 -> 1 bits).
2080	orr	r2,r2,r2,lsl#8
2081	orr	r3,r3,r3,lsr#8
2082	orr	r0,r0,r0,lsl#8
2083	orr	r1,r1,r1,lsr#8
2084	and	r2,r2,r9		@ &=0x00ff00ff
2085	and	r3,r3,r9,lsl#8		@ &=0xff00ff00
2086	and	r0,r0,r9		@ &=0x00ff00ff
2087	and	r1,r1,r9,lsl#8		@ &=0xff00ff00
2088	orr	r2,r2,r2,lsl#4
2089	orr	r3,r3,r3,lsr#4
2090	orr	r0,r0,r0,lsl#4
2091	orr	r1,r1,r1,lsr#4
2092	and	r2,r2,r8		@ &=0x0f0f0f0f
2093	and	r3,r3,r8,lsl#4		@ &=0xf0f0f0f0
2094	and	r0,r0,r8		@ &=0x0f0f0f0f
2095	and	r1,r1,r8,lsl#4		@ &=0xf0f0f0f0
2096	orr	r2,r2,r2,lsl#2
2097	orr	r3,r3,r3,lsr#2
2098	orr	r0,r0,r0,lsl#2
2099	orr	r1,r1,r1,lsr#2
2100	and	r2,r2,r7		@ &=0x33333333
2101	and	r3,r3,r7,lsl#2		@ &=0xcccccccc
2102	and	r0,r0,r7		@ &=0x33333333
2103	and	r1,r1,r7,lsl#2		@ &=0xcccccccc
2104	orr	r2,r2,r2,lsl#1
2105	orr	r3,r3,r3,lsr#1
2106	orr	r0,r0,r0,lsl#1
2107	orr	r1,r1,r1,lsr#1
2108	and	r2,r2,r6		@ &=0x55555555
2109	and	r3,r3,r6,lsl#1		@ &=0xaaaaaaaa
2110	and	r0,r0,r6		@ &=0x55555555
2111	and	r1,r1,r6,lsl#1		@ &=0xaaaaaaaa
2112
2113	orr	r2,r2,r3
2114	orr	r0,r0,r1
2115
@ Fewer than 8 bytes wanted (1..7 here, given the entry check): finish in
@ the byte-wise tail.
2116	cmp	r5,#8
2117	blo	.Lsqueeze_tail
@ Store all 8 bytes, least significant first, one byte at a time so the
@ output pointer needs no alignment.
2118	mov	r1,r2,lsr#8
2119	strb	r2,[r4],#1
2120	mov	r3,r2,lsr#16
2121	strb	r1,[r4],#1
2122	mov	r2,r2,lsr#24
2123	strb	r3,[r4],#1
2124	strb	r2,[r4],#1
2125
2126	mov	r1,r0,lsr#8
2127	strb	r0,[r4],#1
2128	mov	r3,r0,lsr#16
2129	strb	r1,[r4],#1
2130	mov	r0,r0,lsr#24
2131	strb	r3,[r4],#1
2132	strb	r0,[r4],#1
2133	subs	r5,r5,#8
2134	beq	.Lsqueeze_done
2135
2136	subs	r12,r12,#8		@ bsz -= 8
2137	bhi	.Loop_squeeze
2138
@ Block exhausted and more output wanted: permute the state in place.
2139	mov	r0,r14			@ original r10
2140
2141	bl	KeccakF1600
2142
@ KeccakF1600 preserves r4-r11 only; reload the masks (r6-r9), A (r10)
@ and the block size (r12) from the stack and restart at the head of A.
2143	ldmia	sp,{r6,r7,r8,r9,r10,r12}		@ restore constants and variables
2144	mov	r14,r10
2145	b	.Loop_squeeze
2146
@ Tail: 1..7 bytes.  Each byte is stored first and the counter tested
@ afterwards, so r5 must be non-zero on entry.
2147.align	4
2148.Lsqueeze_tail:
2149	strb	r2,[r4],#1
2150	mov	r2,r2,lsr#8
2151	subs	r5,r5,#1
2152	beq	.Lsqueeze_done
2153	strb	r2,[r4],#1
2154	mov	r2,r2,lsr#8
2155	subs	r5,r5,#1
2156	beq	.Lsqueeze_done
2157	strb	r2,[r4],#1
2158	mov	r2,r2,lsr#8
2159	subs	r5,r5,#1
2160	beq	.Lsqueeze_done
2161	strb	r2,[r4],#1
2162	subs	r5,r5,#1
2163	beq	.Lsqueeze_done
2164
2165	strb	r0,[r4],#1
2166	mov	r0,r0,lsr#8
2167	subs	r5,r5,#1
2168	beq	.Lsqueeze_done
2169	strb	r0,[r4],#1
2170	mov	r0,r0,lsr#8
2171	subs	r5,r5,#1
2172	beq	.Lsqueeze_done
2173	strb	r0,[r4]
2174	b	.Lsqueeze_done
2175
@ Drop the four masks plus the saved r0 and r3 (24 bytes), leaving sp at
@ the saved r4.
2176.align	4
2177.Lsqueeze_done:
2178	add	sp,sp,#24
2179#if __ARM_ARCH__>=5
2180	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,pc}
2181#else
2182	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,lr}
2183	tst	lr,#1
2184	moveq	pc,lr		@ be binary compatible with V4, yet
2185.word	0xe12fff1e		@ interoperable with Thumb ISA:-)
2186#endif
2187.size	SHA3_squeeze,.-SHA3_squeeze
2188#if __ARM_MAX_ARCH__>=7
2189.fpu	neon
2190
@ iotas64 -- table of 24 64-bit constants, one per pass of .Loop_neon in
@ KeccakF1600_neon.  That loop walks the table with a post-incremented
@ pointer and XORs each entry into A[0][0] ("Iota[i++]").
2191.type	iotas64, %object
2192.align	5
2193iotas64:
2194.quad	0x0000000000000001
2195.quad	0x0000000000008082
2196.quad	0x800000000000808a
2197.quad	0x8000000080008000
2198.quad	0x000000000000808b
2199.quad	0x0000000080000001
2200.quad	0x8000000080008081
2201.quad	0x8000000000008009
2202.quad	0x000000000000008a
2203.quad	0x0000000000000088
2204.quad	0x0000000080008009
2205.quad	0x000000008000000a
2206.quad	0x000000008000808b
2207.quad	0x800000000000008b
2208.quad	0x8000000000008089
2209.quad	0x8000000000008003
2210.quad	0x8000000000008002
2211.quad	0x8000000000000080
2212.quad	0x000000000000800a
2213.quad	0x800000008000000a
2214.quad	0x8000000080008081
2215.quad	0x8000000000008080
2216.quad	0x0000000080000001
2217.quad	0x8000000080008008
2218.size	iotas64,.-iotas64
2219
@ KeccakF1600_neon -- 24 rounds on a state held entirely in NEON
@ registers.
@ In:	r0 = pointer to 24 bytes of memory used as spill space
@	     (16 bytes at [r0], 8 bytes at [r0,#16]; accessed with :64
@	     alignment hints)
@	state lanes in d0-d24, laid out as SHA3_absorb_neon loads them:
@	     A[0][0..4] = d0,d2,d4,d6,d8	A[1][0..4] = d1,d3,d5,d7,d9
@	     A[2][0..4] = d10,d12,d14,d16,d18	A[3][0..4] = d11,d13,d15,d17,d19
@	     A[4][0..4] = d20,d21,d22,d23,d24
@	Rows 0/1 and rows 2/3 therefore pair up in q registers, so one q
@	operation handles the same column of two rows.
@ Out:	updated state in d0-d24
@ Clobbers: r1, r2, r3, d25-d31 (d25 and q13-q15), flags, and the 24 bytes
@	at [r0].
2220.type	KeccakF1600_neon, %function
2221.align	5
2222KeccakF1600_neon:
2223	add	r1, r0, #16
2224	adr	r2, iotas64
2225	mov	r3, #24			@ loop counter
2226	b	.Loop_neon
2227
2228.align	4
2229.Loop_neon:
2230	@ Theta
@ q4 and d18 are spilled so that they can serve as temporaries for the
@ rotated column sums; both are reloaded before A[..][4] is updated.
2231	vst1.64	{q4},  [r0,:64]		@ offload A[0..1][4]
2232	veor	q13, q0,  q5		@ A[0..1][0]^A[2..3][0]
2233	vst1.64	{d18}, [r1,:64]		@ offload A[2][4]
2234	veor	q14, q1,  q6		@ A[0..1][1]^A[2..3][1]
2235	veor	q15, q2,  q7		@ A[0..1][2]^A[2..3][2]
2236	veor	d26, d26, d27		@ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0]
2237	veor	d27, d28, d29		@ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1]
2238	veor	q14, q3,  q8		@ A[0..1][3]^A[2..3][3]
2239	veor	q4,  q4,  q9		@ A[0..1][4]^A[2..3][4]
2240	veor	d30, d30, d31		@ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2]
2241	veor	d31, d28, d29		@ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3]
2242	veor	d25, d8,  d9		@ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4]
2243	veor	q13, q13, q10		@ C[0..1]^=A[4][0..1]
2244	veor	q14, q15, q11		@ C[2..3]^=A[4][2..3]
2245	veor	d25, d25, d24		@ C[4]^=A[4][4]
2246
@ Rotate left by one: x+x is x<<1, then vsri #63 inserts the old top bit
@ into bit 0.
2247	vadd.u64	q4,  q13, q13		@ C[0..1]<<1
2248	vadd.u64	q15, q14, q14		@ C[2..3]<<1
2249	vadd.u64	d18, d25, d25		@ C[4]<<1
2250	vsri.u64	q4,  q13, #63		@ ROL64(C[0..1],1)
2251	vsri.u64	q15, q14, #63		@ ROL64(C[2..3],1)
2252	vsri.u64	d18, d25, #63		@ ROL64(C[4],1)
2253	veor	d25, d25, d9		@ D[0] = C[4] ^= ROL64(C[1],1)
2254	veor	q13, q13, q15		@ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
2255	veor	d28, d28, d18		@ D[3] = C[2] ^= ROL64(C[4],1)
2256	veor	d29, d29, d8		@ D[4] = C[3] ^= ROL64(C[0],1)
2257
2258	veor	d0,  d0,  d25		@ A[0][0] ^= C[4]
2259	veor	d1,  d1,  d25		@ A[1][0] ^= C[4]
2260	veor	d10, d10, d25		@ A[2][0] ^= C[4]
2261	veor	d11, d11, d25		@ A[3][0] ^= C[4]
2262	veor	d20, d20, d25		@ A[4][0] ^= C[4]
2263
2264	veor	d2,  d2,  d26		@ A[0][1] ^= D[1]
2265	veor	d3,  d3,  d26		@ A[1][1] ^= D[1]
2266	veor	d12, d12, d26		@ A[2][1] ^= D[1]
2267	veor	d13, d13, d26		@ A[3][1] ^= D[1]
2268	veor	d21, d21, d26		@ A[4][1] ^= D[1]
@ Duplicate D[2] into both halves of q13 so the q-wide XORs below apply
@ it to two rows at once.
2269	vmov	d26, d27
2270
2271	veor	d6,  d6,  d28		@ A[0][3] ^= C[2]
2272	veor	d7,  d7,  d28		@ A[1][3] ^= C[2]
2273	veor	d16, d16, d28		@ A[2][3] ^= C[2]
2274	veor	d17, d17, d28		@ A[3][3] ^= C[2]
2275	veor	d23, d23, d28		@ A[4][3] ^= C[2]
2276	vld1.64	{q4},  [r0,:64]		@ restore A[0..1][4]
@ Likewise duplicate D[4] into both halves of q14.
2277	vmov	d28, d29
2278
2279	vld1.64	{d18}, [r1,:64]		@ restore A[2][4]
2280	veor	q2,  q2,  q13		@ A[0..1][2] ^= D[2]
2281	veor	q7,  q7,  q13		@ A[2..3][2] ^= D[2]
2282	veor	d22, d22, d27		@ A[4][2]    ^= D[2]
2283
2284	veor	q4,  q4,  q14		@ A[0..1][4] ^= C[3]
2285	veor	q9,  q9,  q14		@ A[2..3][4] ^= C[3]
2286	veor	d24, d24, d29		@ A[4][4]    ^= C[3]
2287
2288	@ Rho + Pi
@ Each rotation is a vshl into the destination followed by a vsri that
@ fills in the bits shifted out at the top.  The lanes are processed in
@ chains, four at a time: every group overwrites registers whose old
@ contents were consumed by the previous group.  Row 0 is saved in
@ d26-d29 first because it is overwritten at the start and only consumed
@ by the last group.
2289	vmov	d26, d2			@ C[1] = A[0][1]
2290	vshl.u64	d2,  d3,  #44
2291	vmov	d27, d4			@ C[2] = A[0][2]
2292	vshl.u64	d4,  d14, #43
2293	vmov	d28, d6			@ C[3] = A[0][3]
2294	vshl.u64	d6,  d17, #21
2295	vmov	d29, d8			@ C[4] = A[0][4]
2296	vshl.u64	d8,  d24, #14
2297	vsri.u64	d2,  d3,  #64-44	@ A[0][1] = ROL64(A[1][1], rhotates[1][1])
2298	vsri.u64	d4,  d14, #64-43	@ A[0][2] = ROL64(A[2][2], rhotates[2][2])
2299	vsri.u64	d6,  d17, #64-21	@ A[0][3] = ROL64(A[3][3], rhotates[3][3])
2300	vsri.u64	d8,  d24, #64-14	@ A[0][4] = ROL64(A[4][4], rhotates[4][4])
2301
2302	vshl.u64	d3,  d9,  #20
2303	vshl.u64	d14, d16, #25
2304	vshl.u64	d17, d15, #15
2305	vshl.u64	d24, d21, #2
2306	vsri.u64	d3,  d9,  #64-20	@ A[1][1] = ROL64(A[1][4], rhotates[1][4])
2307	vsri.u64	d14, d16, #64-25	@ A[2][2] = ROL64(A[2][3], rhotates[2][3])
2308	vsri.u64	d17, d15, #64-15	@ A[3][3] = ROL64(A[3][2], rhotates[3][2])
2309	vsri.u64	d24, d21, #64-2		@ A[4][4] = ROL64(A[4][1], rhotates[4][1])
2310
@ A rotation by a whole number of bytes (8 here, 56 in the next group) is
@ done with a single vext.8 instead of the vshl/vsri pair.
2311	vshl.u64	d9,  d22, #61
2312	@ vshl.u64	d16, d19, #8
2313	vshl.u64	d15, d12, #10
2314	vshl.u64	d21, d7,  #55
2315	vsri.u64	d9,  d22, #64-61	@ A[1][4] = ROL64(A[4][2], rhotates[4][2])
2316	vext.8	d16, d19, d19, #8-1	@ A[2][3] = ROL64(A[3][4], rhotates[3][4])
2317	vsri.u64	d15, d12, #64-10	@ A[3][2] = ROL64(A[2][1], rhotates[2][1])
2318	vsri.u64	d21, d7,  #64-55	@ A[4][1] = ROL64(A[1][3], rhotates[1][3])
2319
2320	vshl.u64	d22, d18, #39
2321	@ vshl.u64	d19, d23, #56
2322	vshl.u64	d12, d5,  #6
2323	vshl.u64	d7,  d13, #45
2324	vsri.u64	d22, d18, #64-39	@ A[4][2] = ROL64(A[2][4], rhotates[2][4])
2325	vext.8	d19, d23, d23, #8-7	@ A[3][4] = ROL64(A[4][3], rhotates[4][3])
2326	vsri.u64	d12, d5,  #64-6		@ A[2][1] = ROL64(A[1][2], rhotates[1][2])
2327	vsri.u64	d7,  d13, #64-45	@ A[1][3] = ROL64(A[3][1], rhotates[3][1])
2328
2329	vshl.u64	d18, d20, #18
2330	vshl.u64	d23, d11, #41
2331	vshl.u64	d5,  d10, #3
2332	vshl.u64	d13, d1,  #36
2333	vsri.u64	d18, d20, #64-18	@ A[2][4] = ROL64(A[4][0], rhotates[4][0])
2334	vsri.u64	d23, d11, #64-41	@ A[4][3] = ROL64(A[3][0], rhotates[3][0])
2335	vsri.u64	d5,  d10, #64-3		@ A[1][2] = ROL64(A[2][0], rhotates[2][0])
2336	vsri.u64	d13, d1,  #64-36	@ A[3][1] = ROL64(A[1][0], rhotates[1][0])
2337
2338	vshl.u64	d1,  d28, #28
2339	vshl.u64	d10, d26, #1
2340	vshl.u64	d11, d29, #27
2341	vshl.u64	d20, d27, #62
2342	vsri.u64	d1,  d28, #64-28	@ A[1][0] = ROL64(C[3],    rhotates[0][3])
2343	vsri.u64	d10, d26, #64-1		@ A[2][0] = ROL64(C[1],    rhotates[0][1])
2344	vsri.u64	d11, d29, #64-27	@ A[3][0] = ROL64(C[4],    rhotates[0][4])
2345	vsri.u64	d20, d27, #64-62	@ A[4][0] = ROL64(C[2],    rhotates[0][2])
2346
2347	@ Chi + Iota
@ vbic d,n,m computes n & ~m.  Rows 0 and 1 are handled together in
@ q0-q4.  The new column 0 is parked in memory because the old q0 is
@ still needed as an input for columns 3 and 4.
2348	vbic	q13, q2,  q1
2349	vbic	q14, q3,  q2
2350	vbic	q15, q4,  q3
2351	veor	q13, q13, q0		@ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
2352	veor	q14, q14, q1		@ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
2353	veor	q2,  q2,  q15		@ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
2354	vst1.64	{q13}, [r0,:64]		@ offload A[0..1][0]
2355	vbic	q13, q0,  q4
2356	vbic	q15, q1,  q0
2357	vmov	q1,  q14		@ A[0..1][1]
2358	veor	q3,  q3,  q13		@ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
2359	veor	q4,  q4,  q15		@ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])
2360
@ Rows 2 and 3 in q5-q9.  q0 is free until the reload below, so it and
@ q15 keep the old columns 0 and 1 while q5/q6 are updated in place.
2361	vbic	q13, q7,  q6
2362	vmov	q0,  q5			@ A[2..3][0]
2363	vbic	q14, q8,  q7
2364	vmov	q15, q6			@ A[2..3][1]
2365	veor	q5,  q5,  q13		@ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
2366	vbic	q13, q9,  q8
2367	veor	q6,  q6,  q14		@ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
2368	vbic	q14, q0,  q9
2369	veor	q7,  q7,  q13		@ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
2370	vbic	q13, q15, q0
2371	veor	q8,  q8,  q14		@ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
2372	vmov	q14, q10		@ A[4][0..1]
2373	veor	q9,  q9,  q13		@ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])
2374
@ Row 4 in d20-d24, with the old A[4][0..1] kept in d28/d29 (q14).  The
@ round constant is fetched into d25 and r2 advances to the next entry.
2375	vld1.64	d25, [r2,:64]!		@ Iota[i++]
2376	vbic	d26, d22, d21
2377	vbic	d27, d23, d22
2378	vld1.64	{q0}, [r0,:64]		@ restore A[0..1][0]
2379	veor	d20, d20, d26		@ A[4][0] ^= (~A[4][1] & A[4][2])
2380	vbic	d26, d24, d23
2381	veor	d21, d21, d27		@ A[4][1] ^= (~A[4][2] & A[4][3])
2382	vbic	d27, d28, d24
2383	veor	d22, d22, d26		@ A[4][2] ^= (~A[4][3] & A[4][4])
2384	vbic	d26, d29, d28
2385	veor	d23, d23, d27		@ A[4][3] ^= (~A[4][4] & A[4][0])
2386	veor	d0,  d0,  d25		@ A[0][0] ^= Iota[i]
2387	veor	d24, d24, d26		@ A[4][4] ^= (~A[4][0] & A[4][1])
2388
2389	subs	r3, r3, #1
2390	bne	.Loop_neon
2391
2392	bx	lr
2393.size	KeccakF1600_neon,.-KeccakF1600_neon
2394
@ SHA3_absorb_neon: XOR whole bsz-byte blocks of input into the Keccak
@ state and run KeccakF1600_neon after each block.
@
@ In (as read by the code below):
@   r0 = pointer to the state, 25 lanes of 64 bits each; every access
@        carries a :64 qualifier, i.e. the pointer is taken to be 8-byte
@        aligned
@   r1 = inp, r2 = len (bytes), r3 = bsz (block size in bytes)
@ Out:
@   r0 = final value of len, i.e. the trailing input bytes (fewer than
@        bsz) that were not absorbed
@
@ bsz is only ever compared against multiples of 8 up to 8*24, so it is
@ presumably a multiple of 8 in the range 8..200 -- TODO confirm against
@ the callers.
@
@ Register use: r4 = input cursor, r5 = remaining len, r6 = bsz,
@ r12 = scratch, d31 = current input lane.  The state is held in d0-d24
@ in the interleaved layout noted next to the loads.  d8-d15 and
@ r4-r6/lr are saved on entry and restored on exit.
2395.globl	SHA3_absorb_neon
2396.type	SHA3_absorb_neon, %function
2397.align	5
2398SHA3_absorb_neon:
2399	stmdb	sp!, {r4,r5,r6,lr}
2400	vstmdb	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
2401
@ Move the arguments out of r1-r3: the visible tail of KeccakF1600_neon
@ counts its rounds in r3, so bsz cannot stay there across the call.
2402	mov	r4, r1			@ inp
2403	mov	r5, r2			@ len
2404	mov	r6, r3			@ bsz
2405
@ Load the 25 lanes; rows 0/1 and rows 2/3 are interleaved across
@ even/odd d registers, row 4 is contiguous in d20-d24.
2406	vld1.32	{d0}, [r0,:64]!		@ A[0][0]
2407	vld1.32	{d2}, [r0,:64]!		@ A[0][1]
2408	vld1.32	{d4}, [r0,:64]!		@ A[0][2]
2409	vld1.32	{d6}, [r0,:64]!		@ A[0][3]
2410	vld1.32	{d8}, [r0,:64]!		@ A[0][4]
2411
2412	vld1.32	{d1}, [r0,:64]!		@ A[1][0]
2413	vld1.32	{d3}, [r0,:64]!		@ A[1][1]
2414	vld1.32	{d5}, [r0,:64]!		@ A[1][2]
2415	vld1.32	{d7}, [r0,:64]!		@ A[1][3]
2416	vld1.32	{d9}, [r0,:64]!		@ A[1][4]
2417
2418	vld1.32	{d10}, [r0,:64]!		@ A[2][0]
2419	vld1.32	{d12}, [r0,:64]!		@ A[2][1]
2420	vld1.32	{d14}, [r0,:64]!		@ A[2][2]
2421	vld1.32	{d16}, [r0,:64]!		@ A[2][3]
2422	vld1.32	{d18}, [r0,:64]!		@ A[2][4]
2423
2424	vld1.32	{d11}, [r0,:64]!		@ A[3][0]
2425	vld1.32	{d13}, [r0,:64]!		@ A[3][1]
2426	vld1.32	{d15}, [r0,:64]!		@ A[3][2]
2427	vld1.32	{d17}, [r0,:64]!		@ A[3][3]
2428	vld1.32	{d19}, [r0,:64]!		@ A[3][4]
2429
2430	vld1.32	{d20,d21,d22,d23}, [r0,:64]!	@ A[4][0..3]
2431	vld1.32	{d24}, [r0,:64]		@ A[4][4]
@ 24 lanes were loaded with write-back (the last load has none), so
@ stepping back 24*8 bytes returns r0 to the start of the state.
2432	sub	r0, r0, #24*8		@ rewind
2433	b	.Loop_absorb_neon
2434
2435.align	4
@ One iteration absorbs one block.  Each "cmp r6, #8*k" below serves two
@ exits: the blo right after it leaves when bsz < 8*k, and the beq one
@ lane later leaves when bsz == 8*k.  The vld1/veor in between leave the
@ condition flags untouched, which is what makes this pairing work.
2436.Loop_absorb_neon:
2437	subs	r12, r5, r6		@ len - bsz
2438	blo	.Labsorbed_neon		@ less than one block left: finish
2439	mov	r5, r12			@ commit len -= bsz
2440
2441	vld1.8	{d31}, [r4]!		@ endian-neutral loads...
2442	cmp	r6, #8*2
2443	veor	d0, d0, d31		@ A[0][0] ^= *inp++
2444	blo	.Lprocess_neon
2445	vld1.8	{d31}, [r4]!
2446	veor	d2, d2, d31		@ A[0][1] ^= *inp++
2447	beq	.Lprocess_neon
2448	vld1.8	{d31}, [r4]!
2449	cmp	r6, #8*4
2450	veor	d4, d4, d31		@ A[0][2] ^= *inp++
2451	blo	.Lprocess_neon
2452	vld1.8	{d31}, [r4]!
2453	veor	d6, d6, d31		@ A[0][3] ^= *inp++
2454	beq	.Lprocess_neon
2455	vld1.8	{d31},[r4]!
2456	cmp	r6, #8*6
2457	veor	d8, d8, d31		@ A[0][4] ^= *inp++
2458	blo	.Lprocess_neon
2459
2460	vld1.8	{d31}, [r4]!
2461	veor	d1, d1, d31		@ A[1][0] ^= *inp++
2462	beq	.Lprocess_neon
2463	vld1.8	{d31}, [r4]!
2464	cmp	r6, #8*8
2465	veor	d3, d3, d31		@ A[1][1] ^= *inp++
2466	blo	.Lprocess_neon
2467	vld1.8	{d31}, [r4]!
2468	veor	d5, d5, d31		@ A[1][2] ^= *inp++
2469	beq	.Lprocess_neon
2470	vld1.8	{d31}, [r4]!
2471	cmp	r6, #8*10
2472	veor	d7, d7, d31		@ A[1][3] ^= *inp++
2473	blo	.Lprocess_neon
2474	vld1.8	{d31}, [r4]!
2475	veor	d9, d9, d31		@ A[1][4] ^= *inp++
2476	beq	.Lprocess_neon
2477
2478	vld1.8	{d31}, [r4]!
2479	cmp	r6, #8*12
2480	veor	d10, d10, d31		@ A[2][0] ^= *inp++
2481	blo	.Lprocess_neon
2482	vld1.8	{d31}, [r4]!
2483	veor	d12, d12, d31		@ A[2][1] ^= *inp++
2484	beq	.Lprocess_neon
2485	vld1.8	{d31}, [r4]!
2486	cmp	r6, #8*14
2487	veor	d14, d14, d31		@ A[2][2] ^= *inp++
2488	blo	.Lprocess_neon
2489	vld1.8	{d31}, [r4]!
2490	veor	d16, d16, d31		@ A[2][3] ^= *inp++
2491	beq	.Lprocess_neon
2492	vld1.8	{d31}, [r4]!
2493	cmp	r6, #8*16
2494	veor	d18, d18, d31		@ A[2][4] ^= *inp++
2495	blo	.Lprocess_neon
2496
2497	vld1.8	{d31}, [r4]!
2498	veor	d11, d11, d31		@ A[3][0] ^= *inp++
2499	beq	.Lprocess_neon
2500	vld1.8	{d31}, [r4]!
2501	cmp	r6, #8*18
2502	veor	d13, d13, d31		@ A[3][1] ^= *inp++
2503	blo	.Lprocess_neon
2504	vld1.8	{d31}, [r4]!
2505	veor	d15, d15, d31		@ A[3][2] ^= *inp++
2506	beq	.Lprocess_neon
2507	vld1.8	{d31}, [r4]!
2508	cmp	r6, #8*20
2509	veor	d17, d17, d31		@ A[3][3] ^= *inp++
2510	blo	.Lprocess_neon
2511	vld1.8	{d31}, [r4]!
2512	veor	d19, d19, d31		@ A[3][4] ^= *inp++
2513	beq	.Lprocess_neon
2514
2515	vld1.8	{d31}, [r4]!
2516	cmp	r6, #8*22
2517	veor	d20, d20, d31		@ A[4][0] ^= *inp++
2518	blo	.Lprocess_neon
2519	vld1.8	{d31}, [r4]!
2520	veor	d21, d21, d31		@ A[4][1] ^= *inp++
2521	beq	.Lprocess_neon
2522	vld1.8	{d31}, [r4]!
2523	cmp	r6, #8*24
2524	veor	d22, d22, d31		@ A[4][2] ^= *inp++
2525	blo	.Lprocess_neon
2526	vld1.8	{d31}, [r4]!
2527	veor	d23, d23, d31		@ A[4][3] ^= *inp++
2528	beq	.Lprocess_neon
2529	vld1.8	{d31}, [r4]!
2530	veor	d24, d24, d31		@ A[4][4] ^= *inp++
2531
@ Block fully XORed in (the 25-lane case falls through to here).
@ The bl overwrites lr, which is why lr was pushed on entry.
2532.Lprocess_neon:
2533	bl	KeccakF1600_neon
2534	b	.Loop_absorb_neon
2535
2536.align	4
@ Write the state back in the same lane order it was loaded in.  r0 is
@ used as the state base again here, so the code relies on it still
@ pointing at the state after any KeccakF1600_neon calls.
2537.Labsorbed_neon:
2538	vst1.32	{d0}, [r0,:64]!		@ A[0][0..4]
2539	vst1.32	{d2}, [r0,:64]!
2540	vst1.32	{d4}, [r0,:64]!
2541	vst1.32	{d6}, [r0,:64]!
2542	vst1.32	{d8}, [r0,:64]!
2543
2544	vst1.32	{d1}, [r0,:64]!		@ A[1][0..4]
2545	vst1.32	{d3}, [r0,:64]!
2546	vst1.32	{d5}, [r0,:64]!
2547	vst1.32	{d7}, [r0,:64]!
2548	vst1.32	{d9}, [r0,:64]!
2549
2550	vst1.32	{d10}, [r0,:64]!		@ A[2][0..4]
2551	vst1.32	{d12}, [r0,:64]!
2552	vst1.32	{d14}, [r0,:64]!
2553	vst1.32	{d16}, [r0,:64]!
2554	vst1.32	{d18}, [r0,:64]!
2555
2556	vst1.32	{d11}, [r0,:64]!		@ A[3][0..4]
2557	vst1.32	{d13}, [r0,:64]!
2558	vst1.32	{d15}, [r0,:64]!
2559	vst1.32	{d17}, [r0,:64]!
2560	vst1.32	{d19}, [r0,:64]!
2561
2562	vst1.32	{d20,d21,d22,d23}, [r0,:64]!	@ A[4][0..4]
2563	vst1.32	{d24}, [r0,:64]
2564
2565	mov	r0, r5			@ return value
2566	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
2567	ldmia	sp!, {r4,r5,r6,pc}	@ restore r4-r6 and return (saved lr -> pc)
2568.size	SHA3_absorb_neon,.-SHA3_absorb_neon
2569
@ SHA3_squeeze_neon: copy len bytes of output from the Keccak state to
@ out, running KeccakF1600_neon each time bsz bytes of the state have
@ been consumed and more output is still wanted.
@
@ In (as read by the code below):
@   r0 = pointer to the state, 25 lanes of 64 bits each, accessed with
@        :64 (8-byte aligned) qualifiers around the permutation
@   r1 = out, r2 = len (bytes), r3 = bsz (block size in bytes)
@ Out:
@   nothing is returned in a register; out[0..len-1] is written and the
@   state in memory is updated by every permutation that was run.
@
@ len == 0 returns immediately without storing anything to out.
@
@ bsz is counted down in steps of 8, so it is presumably a non-zero
@ multiple of 8 -- TODO confirm against the callers.
@
@ Register use: r4 = output cursor, r5 = remaining len, r6 = bsz,
@ r12 = read cursor inside the state, r14 = bytes left in the current
@ block.  r4-r6/lr are saved on entry; d8-d15 are saved only around the
@ permutation, which is the only place they are overwritten.
2570.globl	SHA3_squeeze_neon
2571.type	SHA3_squeeze_neon, %function
2572.align	5
2573SHA3_squeeze_neon:
2574	stmdb	sp!, {r4,r5,r6,lr}
2575
2576	mov	r4, r1			@ out
2577	mov	r5, r2			@ len
2578	mov	r6, r3			@ bsz
2579	mov	r12, r0			@ A_flat
2580	mov	r14, r3			@ bsz
	cmp	r5, #0			@ zero-length request?
	beq	.Lsqueeze_neon_done	@ yes: the tail path would store a byte
2581	b	.Loop_squeeze_neon
2582
2583.align	4
@ Invariant on entry to each iteration: r5 > 0, and r12 points at the
@ next unread lane of the state.
2584.Loop_squeeze_neon:
2585	cmp	r5, #8
2586	blo	.Lsqueeze_neon_tail	@ 1..7 bytes left: finish bytewise
2587	vld1.32	{d0}, [r12]!
2588	vst1.8	{d0}, [r4]!		@ endian-neutral store
2589
2590	subs	r5, r5, #8		@ len -= 8
2591	beq	.Lsqueeze_neon_done
2592
2593	subs	r14, r14, #8		@ bsz -= 8
2594	bhi	.Loop_squeeze_neon	@ block not exhausted yet
2595
@ Block exhausted but more output is wanted: load the state into NEON
@ registers, permute, and write it back.
2596	vstmdb	sp!,  {d8,d9,d10,d11,d12,d13,d14,d15}
2597
2598	vld1.32	{d0}, [r0,:64]!		@ A[0][0..4]
2599	vld1.32	{d2}, [r0,:64]!
2600	vld1.32	{d4}, [r0,:64]!
2601	vld1.32	{d6}, [r0,:64]!
2602	vld1.32	{d8}, [r0,:64]!
2603
2604	vld1.32	{d1}, [r0,:64]!		@ A[1][0..4]
2605	vld1.32	{d3}, [r0,:64]!
2606	vld1.32	{d5}, [r0,:64]!
2607	vld1.32	{d7}, [r0,:64]!
2608	vld1.32	{d9}, [r0,:64]!
2609
2610	vld1.32	{d10}, [r0,:64]!		@ A[2][0..4]
2611	vld1.32	{d12}, [r0,:64]!
2612	vld1.32	{d14}, [r0,:64]!
2613	vld1.32	{d16}, [r0,:64]!
2614	vld1.32	{d18}, [r0,:64]!
2615
2616	vld1.32	{d11}, [r0,:64]!		@ A[3][0..4]
2617	vld1.32	{d13}, [r0,:64]!
2618	vld1.32	{d15}, [r0,:64]!
2619	vld1.32	{d17}, [r0,:64]!
2620	vld1.32	{d19}, [r0,:64]!
2621
2622	vld1.32	{d20,d21,d22,d23}, [r0,:64]!	@ A[4][0..4]
2623	vld1.32	{d24}, [r0,:64]
2624	sub	r0, r0, #24*8		@ rewind
2625
2626	bl	KeccakF1600_neon
2627
@ r0 is reused as the state base below, so this relies on
@ KeccakF1600_neon returning with r0 unchanged.
2628	mov	r12, r0			@ A_flat
2629	vst1.32	{d0}, [r0,:64]!		@ A[0][0..4]
2630	vst1.32	{d2}, [r0,:64]!
2631	vst1.32	{d4}, [r0,:64]!
2632	vst1.32	{d6}, [r0,:64]!
2633	vst1.32	{d8}, [r0,:64]!
2634
2635	vst1.32	{d1}, [r0,:64]!		@ A[1][0..4]
2636	vst1.32	{d3}, [r0,:64]!
2637	vst1.32	{d5}, [r0,:64]!
2638	vst1.32	{d7}, [r0,:64]!
2639	vst1.32	{d9}, [r0,:64]!
2640
2641	vst1.32	{d10}, [r0,:64]!		@ A[2][0..4]
2642	vst1.32	{d12}, [r0,:64]!
2643	vst1.32	{d14}, [r0,:64]!
2644	vst1.32	{d16}, [r0,:64]!
2645	vst1.32	{d18}, [r0,:64]!
2646
2647	vst1.32	{d11}, [r0,:64]!		@ A[3][0..4]
2648	vst1.32	{d13}, [r0,:64]!
2649	vst1.32	{d15}, [r0,:64]!
2650	vst1.32	{d17}, [r0,:64]!
2651	vst1.32	{d19}, [r0,:64]!
2652
2653	vst1.32	{d20,d21,d22,d23}, [r0,:64]!	@ A[4][0..4]
2654	mov	r14, r6			@ bsz
2655	vst1.32	{d24}, [r0,:64]
2656	mov	r0,  r12		@ rewind
2657
2658	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
2659	b	.Loop_squeeze_neon
2660
2661.align	4
@ Final 1..7 bytes: fetch the next lane as two words and emit it one
@ byte at a time, least significant byte of r2 first, then r3.  Each cmp
@ feeds a blo (len below the tested value) and a later beq (len equal
@ to it); strb and the plain mov shifts leave the flags untouched.
2662.Lsqueeze_neon_tail:
2663	ldmia	r12, {r2,r3}
2664	cmp	r5, #2
2665	strb	r2, [r4],#1		@ endian-neutral store
2666	mov	r2, r2, lsr#8
2667	blo	.Lsqueeze_neon_done	@ len == 1
2668	strb	r2, [r4], #1
2669	mov	r2, r2, lsr#8
2670	beq	.Lsqueeze_neon_done	@ len == 2
2671	strb	r2, [r4], #1
2672	mov	r2, r2, lsr#8
2673	cmp	r5, #4
2674	blo	.Lsqueeze_neon_done	@ len == 3
2675	strb	r2, [r4], #1
2676	beq	.Lsqueeze_neon_done	@ len == 4
2677
2678	strb	r3, [r4], #1
2679	mov	r3, r3, lsr#8
2680	cmp	r5, #6
2681	blo	.Lsqueeze_neon_done	@ len == 5
2682	strb	r3, [r4], #1
2683	mov	r3, r3, lsr#8
2684	beq	.Lsqueeze_neon_done	@ len == 6
2685	strb	r3, [r4], #1		@ len == 7
2686
2687.Lsqueeze_neon_done:
2688	ldmia	sp!, {r4,r5,r6,pc}	@ restore r4-r6 and return (saved lr -> pc)
2689.size	SHA3_squeeze_neon,.-SHA3_squeeze_neon
2690#endif
@ NUL-terminated ASCII identification string embedded in the object:
@ "Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
2691.byte	75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111,114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
2692.align	2
2693.align	2
2694