#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2014
#
# ChaCha20 for ARMv4.
#
# Performance in cycles per byte out of a large buffer.
#
#			IALU/gcc-4.4    1xNEON      3xNEON+1xIALU
#
# Cortex-A5		19.3(*)/+95%    21.8        14.1
# Cortex-A8		10.5(*)/+160%   13.9        6.35
# Cortex-A9		12.9(**)/+110%  14.3        6.50
# Cortex-A15		11.0/+40%       16.0        5.00
# Snapdragon S4		11.5/+125%      13.6        4.90
#
# (*)	most "favourable" result for aligned data on a little-endian
#	processor; the result for misaligned data is 10-15% lower;
# (**)	this result is a trade-off: it can be improved by 20%,
#	but then the Snapdragon S4 and Cortex-A8 results get
#	20-25% worse;

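# The build interface, sketched here for orientation (the exact flavour
# strings are whatever the calling build system uses, e.g. "linux32" is
# assumed below purely for illustration):
#
#	perl chacha-armv4.pl linux32 chacha-armv4.S
#
# A flavour other than "void" routes the generated perlasm through
# arm-xlate.pl to adapt it to the target assembler; otherwise the code
# is written to the output file as-is.
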
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
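# So, for example, &add(@x[0],@x[0],@x[4]) appends "\tadd\tr0,r0,r4\n"
# to $code; underscores in the "opcode" become dots and a purely numeric
# last argument becomes an immediate, so &vshr_u32($b,$t,20) yields
# "\tvshr.u32\t<b>,<t>,#20\n".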

my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
my @t=map("r$_",(8..11));
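# @x names the 16 ChaCha state words: @x[0..7] live permanently in
# r0-r7, @x[12] in r12 and @x[14] in r14, while the entries mapped to
# the placeholder "rx" (@x[8..11], @x[13], @x[15]) are kept on the
# stack and cycled through the temporaries @t (r8-r11) as needed.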

sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my $odd = $d0&1;
my ($xc,$xc_) = (@t[0..1]);
my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]);
my @ret;

	# Consider the order in which the variables are addressed by
	# their index:
	#
	#       a   b   c   d
	#
	#       0   4   8  12 < even round
	#       1   5   9  13
	#       2   6  10  14
	#       3   7  11  15
	#       0   5  10  15 < odd round
	#       1   6  11  12
	#       2   7   8  13
	#       3   4   9  14
	#
	# 'a' and 'b' are permanently allocated in registers, @x[0..7],
	# while the 'c's and a pair of 'd's are maintained in memory. If
	# you observe the 'c' column, you'll notice that the pair of 'c's
	# is invariant between rounds. This means that we only have to
	# reload them once per round, in the middle. This is why you'll
	# see a bunch of 'c' stores and loads in the middle, but none at
	# the beginning or end. If you observe the 'd' column, you'll
	# notice that 15 and 13 are reused in the next pair of rounds.
	# This is why these two are chosen for offloading to memory, to
	# make the loads count more.
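	#
	# For reference, one ChaCha quarter round on (a,b,c,d) is
	#
	#	a += b; d ^= a; d = rol(d,16);
	#	c += d; b ^= c; b = rol(b,12);
	#	a += b; d ^= a; d = rol(d, 8);
	#	c += d; b ^= c; b = rol(b, 7);
	#
	# The ror#16/20/24/25 amounts below are 32 minus these rotate-left
	# amounts: rol(x^y,n) is computed as (x ror (32-n)) ^ (y ror (32-n)),
	# with one rotation done by a separate mov and the other folded for
	# free into the shifted second operand of the eor.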
							push @ret,(
	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	"&mov	($xd,$xd,'ror#16')",
	 "&add	(@x[$a1],@x[$a1],@x[$b1])",
	 "&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a0],'ror#16')",
	 "&eor	($xd_,$xd_,@x[$a1],'ror#16')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b0],@x[$b0],'ror#20')",
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b1],@x[$b1],'ror#20')",
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#20')",
	 "&eor	(@x[$b1],@x[$b1],$xc_,'ror#20')",

	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	"&mov	($xd,$xd,'ror#24')",
	 "&add	(@x[$a1],@x[$a1],@x[$b1])",
	 "&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a0],'ror#24')",
	 "&eor	($xd_,$xd_,@x[$a1],'ror#24')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b0],@x[$b0],'ror#25')"		);
							push @ret,(
	"&str	($xd,'[sp,#4*(16+$d0)]')",
	"&ldr	($xd,'[sp,#4*(16+$d2)]')"		) if ($odd);
							push @ret,(
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b1],@x[$b1],'ror#25')"		);
							push @ret,(
	 "&str	($xd_,'[sp,#4*(16+$d1)]')",
	 "&ldr	($xd_,'[sp,#4*(16+$d3)]')"		) if (!$odd);
							push @ret,(
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#25')",
	 "&eor	(@x[$b1],@x[$b1],$xc_,'ror#25')"	);

	$xd=@x[$d2]					if (!$odd);
	$xd_=@x[$d3]					if ($odd);
							push @ret,(
	"&str	($xc,'[sp,#4*(16+$c0)]')",
	"&ldr	($xc,'[sp,#4*(16+$c2)]')",
	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	"&mov	($xd,$xd,'ror#16')",
	 "&str	($xc_,'[sp,#4*(16+$c1)]')",
	 "&ldr	($xc_,'[sp,#4*(16+$c3)]')",
	 "&add	(@x[$a3],@x[$a3],@x[$b3])",
	 "&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a2],'ror#16')",
	 "&eor	($xd_,$xd_,@x[$a3],'ror#16')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b2],@x[$b2],'ror#20')",
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b3],@x[$b3],'ror#20')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#20')",
	 "&eor	(@x[$b3],@x[$b3],$xc_,'ror#20')",

	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	"&mov	($xd,$xd,'ror#24')",
	 "&add	(@x[$a3],@x[$a3],@x[$b3])",
	 "&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a2],'ror#24')",
	 "&eor	($xd_,$xd_,@x[$a3],'ror#24')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b2],@x[$b2],'ror#25')",
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b3],@x[$b3],'ror#25')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#25')",
	 "&eor	(@x[$b3],@x[$b3],$xc_,'ror#25')"	);

	@ret;
}

$code.=<<___;
#include <openssl/arm_arch.h>

@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
.arch  armv7-a

.text
#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif

#if defined(__thumb2__) || defined(__clang__)
#define ldrhsb	ldrbhs
#endif
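@ (in unified syntax the condition suffix follows the "b" width suffix,
@ hence the ldrhsb -> ldrbhs spelling fix-up above)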

.align	5
.Lsigma:
.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
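						@ i.e. "expand 32-byte k"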
.Lone:
.long	1,0,0,0
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word   OPENSSL_armcap_P-.LChaCha20_ctr32
#else
.word	-1
#endif

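@ Calling convention, as consumed by the prologue below:
@
@	r0	out (output buffer)
@	r1	inp (input buffer)
@	r2	len (in bytes)
@	r3	key (8 32-bit words)
@	[sp,#0]	pointer to counter (1 word) and nonce (3 words)
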
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
.LChaCha20_ctr32:
	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb	sp!,{r0-r2,r4-r11,lr}
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r14,pc,#16		@ ChaCha20_ctr32
#else
	adr	r14,.LChaCha20_ctr32
#endif
	cmp	r2,#0			@ len==0?
#ifdef	__thumb2__
	itt	eq
#endif
	addeq	sp,sp,#4*3
	beq	.Lno_data
#if __ARM_MAX_ARCH__>=7
	cmp	r2,#192			@ test len
	bls	.Lshort
	ldr	r4,[r14,#-32]
	ldr	r4,[r14,r4]
# ifdef	__APPLE__
	ldr	r4,[r4]
# endif
	tst	r4,#ARMV7_NEON
	bne	.LChaCha20_neon
.Lshort:
#endif
	ldmia	r12,{r4-r7}		@ load counter and nonce
	sub	sp,sp,#4*(16)		@ off-load area
	sub	r14,r14,#64		@ .Lsigma
	stmdb	sp!,{r4-r7}		@ copy counter and nonce
	ldmia	r3,{r4-r11}		@ load key
	ldmia	r14,{r0-r3}		@ load sigma
	stmdb	sp!,{r4-r11}		@ copy key
	stmdb	sp!,{r0-r3}		@ copy sigma
	str	r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str	r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	b	.Loop_outer_enter
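
	@ Resulting stack frame (in 32-bit words):
	@
	@	[sp,#4*(0..15)]		input block: sigma|key|counter|nonce
	@	[sp,#4*(16+0..15)]	off-load area for state words
	@	[sp,#4*(32+0..2)]	saved out, inp and len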

.align	4
.Loop_outer:
	ldmia	sp,{r0-r9}		@ load key material
	str	@t[3],[sp,#4*(32+2)]	@ save len
	str	r12,  [sp,#4*(32+1)]	@ save inp
	str	r14,  [sp,#4*(32+0)]	@ save out
.Loop_outer_enter:
	ldr	@t[3], [sp,#4*(15)]
	ldr	@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	ldr	@t[2], [sp,#4*(13)]
	ldr	@x[14],[sp,#4*(14)]
	str	@t[3], [sp,#4*(16+15)]
	mov	@t[3],#10
	b	.Loop

.align	4
.Loop:
	subs	@t[3],@t[3],#1
___
	foreach (&ROUND(0, 4, 8,12)) { eval; }
	foreach (&ROUND(0, 5,10,15)) { eval; }
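	# &ROUND(0,4,8,12) is the even (column) round and
	# &ROUND(0,5,10,15) the odd (diagonal) round; ten iterations of
	# the pair give ChaCha20's 20 rounds.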
$code.=<<___;
	bne	.Loop

	ldr	@t[3],[sp,#4*(32+2)]	@ load len

	str	@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str	@t[1], [sp,#4*(16+9)]
	str	@x[12],[sp,#4*(16+12)]
	str	@t[2], [sp,#4*(16+13)]
	str	@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	cmp	@t[3],#64		@ done yet?
#ifdef	__thumb2__
	itete	lo
#endif
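	@ hs (len>=64): a full block remains, so fetch the real inp/out
	@ pointers; lo (len<64): point both at the stack so the keystream
	@ lands there and .Ltail finishes the partial block byte by byte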
	addlo	r12,sp,#4*(0)		@ shortcut or ...
	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
	addlo	r14,sp,#4*(0)		@ shortcut or ...
	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out

	ldr	@t[0],[sp,#4*(0)]	@ load key material
	ldr	@t[1],[sp,#4*(1)]

#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
# if __ARM_ARCH__<7
	orr	@t[2],r12,r14
	tst	@t[2],#3		@ are input and output aligned?
	ldr	@t[2],[sp,#4*(2)]
	bne	.Lunaligned
	cmp	@t[3],#64		@ restore flags
# else
	ldr	@t[2],[sp,#4*(2)]
# endif
	ldr	@t[3],[sp,#4*(3)]

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]

	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]	@ xor with input
	eorhs	@x[1],@x[1],@t[1]
	 add	@t[0],sp,#4*(4)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[1],[r14,#-12]
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
	 add	@t[0],sp,#4*(8)
	str	@x[4],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[5],[r14,#-12]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[6],[r14,#-8]
	 add	@x[0],sp,#4*(16+8)
	str	@x[7],[r14,#-4]

	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
# ifdef	__thumb2__
	itt	hi
# endif
	 strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	 strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]
	eorhs	@x[1],@x[1],@t[1]
	 add	@t[0],sp,#4*(12)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	str	@x[1],[r14,#-12]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hi
# endif
	 addhi	@t[0],@t[0],#1		@ next counter value
	 strhi	@t[0],[sp,#4*(12)]	@ save next counter value
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	 it	ne
# endif
	 ldrne	@t[0],[sp,#4*(32+2)]	@ re-load len
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[4],[r14],#16		@ store output
	str	@x[5],[r14,#-12]
# ifdef	__thumb2__
	it	hs
# endif
	 subhs	@t[3],@t[0],#64		@ len-=64
	str	@x[6],[r14,#-8]
	str	@x[7],[r14,#-4]
	bhi	.Loop_outer

	beq	.Ldone
# if __ARM_ARCH__<7
	b	.Ltail

.align	4
.Lunaligned:				@ unaligned endian-neutral path
	cmp	@t[3],#64		@ restore flags
# endif
#endif
#if __ARM_ARCH__<7
	ldr	@t[3],[sp,#4*(3)]
___
for ($i=0;$i<16;$i+=4) {
my $j=$i&0x7;

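# Unaligned endian-neutral lane: input is fetched byte by byte with
# ldrb and folded in with "eor ...,lsr#8", so neither pointer needs
# word alignment; on the lo (len<64) path the input bytes are replaced
# with zero and the raw keystream is stored to the stack for .Ltail.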
$code.=<<___	if ($i==4);
	add	@x[0],sp,#4*(16+8)
___
$code.=<<___	if ($i==8);
	ldmia	@x[0],{@x[0]-@x[7]}		@ load second half
# ifdef	__thumb2__
	itt	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]		@ copy "@x[10]"
	strhi	@t[3],[sp,#4*(16+11)]		@ copy "@x[11]"
___
$code.=<<___;
	add	@x[$j+0],@x[$j+0],@t[0]		@ accumulate key material
___
$code.=<<___	if ($i==12);
# ifdef	__thumb2__
	itt	hi
# endif
	addhi	@t[0],@t[0],#1			@ next counter value
	strhi	@t[0],[sp,#4*(12)]		@ save next counter value
___
$code.=<<___;
	add	@x[$j+1],@x[$j+1],@t[1]
	add	@x[$j+2],@x[$j+2],@t[2]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[0],@t[0],@t[0]		@ zero or ...
	ldrhsb	@t[0],[r12],#16			@ ... load input
	eorlo	@t[1],@t[1],@t[1]
	ldrhsb	@t[1],[r12,#-12]

	add	@x[$j+3],@x[$j+3],@t[3]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[2],@t[2],@t[2]
	ldrhsb	@t[2],[r12,#-8]
	eorlo	@t[3],@t[3],@t[3]
	ldrhsb	@t[3],[r12,#-4]

	eor	@x[$j+0],@t[0],@x[$j+0]		@ xor with input (or zero)
	eor	@x[$j+1],@t[1],@x[$j+1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-15]		@ load more input
	ldrhsb	@t[1],[r12,#-11]
	eor	@x[$j+2],@t[2],@x[$j+2]
	 strb	@x[$j+0],[r14],#16		@ store output
	eor	@x[$j+3],@t[3],@x[$j+3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-7]
	ldrhsb	@t[3],[r12,#-3]
	 strb	@x[$j+1],[r14,#-12]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+2],[r14,#-8]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-14]		@ load more input
	ldrhsb	@t[1],[r12,#-10]
	 strb	@x[$j+3],[r14,#-4]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+0],[r14,#-15]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-6]
	ldrhsb	@t[3],[r12,#-2]
	 strb	@x[$j+1],[r14,#-11]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+2],[r14,#-7]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-13]		@ load more input
	ldrhsb	@t[1],[r12,#-9]
	 strb	@x[$j+3],[r14,#-3]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+0],[r14,#-14]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-5]
	ldrhsb	@t[3],[r12,#-1]
	 strb	@x[$j+1],[r14,#-10]
	 strb	@x[$j+2],[r14,#-6]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+3],[r14,#-2]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
	 strb	@x[$j+0],[r14,#-13]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+1],[r14,#-9]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
	 strb	@x[$j+2],[r14,#-5]
	 strb	@x[$j+3],[r14,#-1]
___
$code.=<<___	if ($i<12);
	add	@t[0],sp,#4*(4+$i)
	ldmia	@t[0],{@t[0]-@t[3]}		@ load key material
___
}
$code.=<<___;
# ifdef	__thumb2__
	it	ne
# endif
	ldrne	@t[0],[sp,#4*(32+2)]		@ re-load len
# ifdef	__thumb2__
	it	hs
# endif
	subhs	@t[3],@t[0],#64			@ len-=64
	bhi	.Loop_outer

	beq	.Ldone
#endif

.Ltail:
	ldr	r12,[sp,#4*(32+1)]	@ load inp
	add	@t[1],sp,#4*(0)
	ldr	r14,[sp,#4*(32+0)]	@ load out

.Loop_tail:
	ldrb	@t[2],[@t[1]],#1	@ read buffer on stack
	ldrb	@t[3],[r12],#1		@ read input
	subs	@t[0],@t[0],#1
	eor	@t[3],@t[3],@t[2]
	strb	@t[3],[r14],#1		@ store output
	bne	.Loop_tail

.Ldone:
	add	sp,sp,#4*(32+3)
.Lno_data:
	ldmia	sp!,{r4-r11,pc}
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

{{{
my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
    map("q$_",(0..15));

sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

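	# One quarter round applied to all four columns (or diagonals)
	# of a single block at once, one per 32-bit lane. NEON has no
	# vector rotate: rol16 is done with vrev32.16 and the other
	# rotates with vshr/vsli pairs, while the trailing vext's rotate
	# the b, c and d rows across lanes, into diagonal form after an
	# even round ($odd=0) and back into column form after an odd one.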
	(
	"&vadd_i32	($a,$a,$b)",
	"&veor		($d,$d,$a)",
	"&vrev32_16	($d,$d)",	# vrot ($d,16)

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,20)",
	"&vsli_32	($b,$t,12)",

	"&vadd_i32	($a,$a,$b)",
	"&veor		($t,$d,$a)",
	"&vshr_u32	($d,$t,24)",
	"&vsli_32	($d,$t,8)",

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,25)",
	"&vsli_32	($b,$t,7)",

	"&vext_8	($c,$c,$c,8)",
	"&vext_8	($b,$b,$b,$odd?12:4)",
	"&vext_8	($d,$d,$d,$odd?4:12)"
	);
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	ldr		r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb		sp!,{r0-r2,r4-r11,lr}
.LChaCha20_neon:
	adr		r14,.Lsigma
	vstmdb		sp!,{d8-d15}		@ ABI spec says so
	stmdb		sp!,{r0-r3}

	vld1.32		{$b0-$c0},[r3]		@ load key
	ldmia		r3,{r4-r11}		@ load key

	sub		sp,sp,#4*(16+16)
	vld1.32		{$d0},[r12]		@ load counter and nonce
	add		r12,sp,#4*8
	ldmia		r14,{r0-r3}		@ load sigma
	vld1.32		{$a0},[r14]!		@ load sigma
	vld1.32		{$t0},[r14]		@ one
	vst1.32		{$c0-$d0},[r12]		@ copy 1/2key|counter|nonce
	vst1.32		{$a0-$b0},[sp]		@ copy sigma|1/2key

	str		r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str		r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	vshl.i32	$t1#lo,$t0#lo,#1	@ two
	vstr		$t0#lo,[sp,#4*(16+0)]
	vshl.i32	$t2#lo,$t0#lo,#2	@ four
	vstr		$t1#lo,[sp,#4*(16+2)]
	vmov		$a1,$a0
	vstr		$t2#lo,[sp,#4*(16+4)]
	vmov		$a2,$a0
	vmov		$b1,$b0
	vmov		$b2,$b0
	b		.Loop_neon_enter

.align	4
.Loop_neon_outer:
	ldmia		sp,{r0-r9}		@ load key material
	cmp		@t[3],#64*2		@ if len<=64*2
	bls		.Lbreak_neon		@ switch to integer-only
	vmov		$a1,$a0
	str		@t[3],[sp,#4*(32+2)]	@ save len
	vmov		$a2,$a0
	str		r12,  [sp,#4*(32+1)]	@ save inp
	vmov		$b1,$b0
	str		r14,  [sp,#4*(32+0)]	@ save out
	vmov		$b2,$b0
.Loop_neon_enter:
	ldr		@t[3], [sp,#4*(15)]
	vadd.i32	$d1,$d0,$t0		@ counter+1
	ldr		@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	vmov		$c1,$c0
	ldr		@t[2], [sp,#4*(13)]
	vmov		$c2,$c0
	ldr		@x[14],[sp,#4*(14)]
	vadd.i32	$d2,$d1,$t0		@ counter+2
	str		@t[3], [sp,#4*(16+15)]
	mov		@t[3],#10
	add		@x[12],@x[12],#3	@ counter+3
	b		.Loop_neon

.align	4
.Loop_neon:
	subs		@t[3],@t[3],#1
___
	my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
	my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
	my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
	my @thread3=&ROUND(0,4,8,12);

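	# Interleave the three NEON blocks with the scalar block (the
	# "3xNEON+1xIALU" mode from the performance table): scalar ROUND
	# instructions are slotted between the NEON ones so the integer
	# and NEON pipelines execute concurrently.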
	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}

	@thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
	@thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
	@thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
	@thread3=&ROUND(0,5,10,15);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}
$code.=<<___;
	bne		.Loop_neon

	add		@t[3],sp,#32
	vld1.32		{$t0-$t1},[sp]		@ load key material
	vld1.32		{$t2-$t3},[@t[3]]

	ldr		@t[3],[sp,#4*(32+2)]	@ load len

	str		@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str		@t[1], [sp,#4*(16+9)]
	str		@x[12],[sp,#4*(16+12)]
	str		@t[2], [sp,#4*(16+13)]
	str		@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	ldr		r12,[sp,#4*(32+1)]	@ load inp
	ldr		r14,[sp,#4*(32+0)]	@ load out

	vadd.i32	$a0,$a0,$t0		@ accumulate key material
	vadd.i32	$a1,$a1,$t0
	vadd.i32	$a2,$a2,$t0
	vldr		$t0#lo,[sp,#4*(16+0)]	@ one

	vadd.i32	$b0,$b0,$t1
	vadd.i32	$b1,$b1,$t1
	vadd.i32	$b2,$b2,$t1
	vldr		$t1#lo,[sp,#4*(16+2)]	@ two

	vadd.i32	$c0,$c0,$t2
	vadd.i32	$c1,$c1,$t2
	vadd.i32	$c2,$c2,$t2
	vadd.i32	$d1#lo,$d1#lo,$t0#lo	@ counter+1
	vadd.i32	$d2#lo,$d2#lo,$t1#lo	@ counter+2

	vadd.i32	$d0,$d0,$t3
	vadd.i32	$d1,$d1,$t3
	vadd.i32	$d2,$d2,$t3

	cmp		@t[3],#64*4
	blo		.Ltail_neon

	vld1.8		{$t0-$t1},[r12]!	@ load input
	 mov		@t[3],sp
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0		@ xor with input
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	 vst1.8		{$a0-$b0},[r14]!	@ store output
	veor		$b1,$b1,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c1,$c1,$t2
	 vst1.8		{$c0-$d0},[r14]!
	veor		$d1,$d1,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a2,$a2,$t0
	 vld1.32	{$a0-$b0},[@t[3]]!	@ load for next iteration
	 veor		$t0#hi,$t0#hi,$t0#hi
	 vldr		$t0#lo,[sp,#4*(16+4)]	@ four
	veor		$b2,$b2,$t1
	 vld1.32	{$c0-$d0},[@t[3]]
	veor		$c2,$c2,$t2
	 vst1.8		{$a1-$b1},[r14]!
	veor		$d2,$d2,$t3
	 vst1.8		{$c1-$d1},[r14]!

	vadd.i32	$d0#lo,$d0#lo,$t0#lo	@ next counter value
	vldr		$t0#lo,[sp,#4*(16+0)]	@ one

	ldmia		sp,{@t[0]-@t[3]}	@ load key material
	add		@x[0],@x[0],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	 vst1.8		{$a2-$b2},[r14]!
	add		@x[1],@x[1],@t[1]
	ldr		@t[1],[r12,#-12]
	 vst1.8		{$c2-$d2},[r14]!
	add		@x[2],@x[2],@t[2]
	ldr		@t[2],[r12,#-8]
	add		@x[3],@x[3],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
# endif
	eor		@x[0],@x[0],@t[0]	@ xor with input
	 add		@t[0],sp,#4*(4)
	eor		@x[1],@x[1],@t[1]
	str		@x[0],[r14],#16		@ store output
	eor		@x[2],@x[2],@t[2]
	str		@x[1],[r14,#-12]
	eor		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[2],[r14,#-8]
	str		@x[3],[r14,#-4]

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	add		@x[5],@x[5],@t[1]
	ldr		@t[1],[r12,#-12]
	add		@x[6],@x[6],@t[2]
	ldr		@t[2],[r12,#-8]
	add		@x[7],@x[7],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	eor		@x[4],@x[4],@t[0]
	 add		@t[0],sp,#4*(8)
	eor		@x[5],@x[5],@t[1]
	str		@x[4],[r14],#16		@ store output
	eor		@x[6],@x[6],@t[2]
	str		@x[5],[r14,#-12]
	eor		@x[7],@x[7],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[6],[r14,#-8]
	 add		@x[0],sp,#4*(16+8)
	str		@x[7],[r14,#-4]

	ldmia		@x[0],{@x[0]-@x[7]}	@ load second half

	add		@x[0],@x[0],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	add		@x[1],@x[1],@t[1]
	ldr		@t[1],[r12,#-12]
# ifdef	__thumb2__
	it	hi
# endif
	 strhi		@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	add		@x[2],@x[2],@t[2]
	ldr		@t[2],[r12,#-8]
# ifdef	__thumb2__
	it	hi
# endif
	 strhi		@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add		@x[3],@x[3],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
# endif
	eor		@x[0],@x[0],@t[0]
	 add		@t[0],sp,#4*(12)
	eor		@x[1],@x[1],@t[1]
	str		@x[0],[r14],#16		@ store output
	eor		@x[2],@x[2],@t[2]
	str		@x[1],[r14,#-12]
	eor		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[2],[r14,#-8]
	str		@x[3],[r14,#-4]

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	 add		@t[0],@t[0],#4		@ next counter value
	add		@x[5],@x[5],@t[1]
	 str		@t[0],[sp,#4*(12)]	@ save next counter value
	ldr		@t[0],[r12],#16		@ load input
	add		@x[6],@x[6],@t[2]
	 add		@x[4],@x[4],#3		@ counter+3
	ldr		@t[1],[r12,#-12]
	add		@x[7],@x[7],@t[3]
	ldr		@t[2],[r12,#-8]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	eor		@x[4],@x[4],@t[0]
# ifdef	__thumb2__
	it	hi
# endif
	 ldrhi		@t[0],[sp,#4*(32+2)]	@ re-load len
	eor		@x[5],@x[5],@t[1]
	eor		@x[6],@x[6],@t[2]
	str		@x[4],[r14],#16		@ store output
	eor		@x[7],@x[7],@t[3]
	str		@x[5],[r14,#-12]
	 sub		@t[3],@t[0],#64*4	@ len-=64*4
	str		@x[6],[r14,#-8]
	str		@x[7],[r14,#-4]
	bhi		.Loop_neon_outer

	b		.Ldone_neon

.align	4
.Lbreak_neon:
	@ harmonize NEON and integer-only stack frames: load data
	@ from NEON frame, but save to integer-only one; distance
	@ between the two is 4*(32+4+16-32)=4*(20).

	str		@t[3], [sp,#4*(20+32+2)]	@ save len
	 add		@t[3],sp,#4*(32+4)
	str		r12,   [sp,#4*(20+32+1)]	@ save inp
	str		r14,   [sp,#4*(20+32+0)]	@ save out

	ldr		@x[12],[sp,#4*(16+10)]
	ldr		@x[14],[sp,#4*(16+11)]
	 vldmia		@t[3],{d8-d15}			@ fulfill ABI requirement
	str		@x[12],[sp,#4*(20+16+10)]	@ copy "@x[10]"
	str		@x[14],[sp,#4*(20+16+11)]	@ copy "@x[11]"

	ldr		@t[3], [sp,#4*(15)]
	ldr		@x[12],[sp,#4*(12)]		@ modulo-scheduled load
	ldr		@t[2], [sp,#4*(13)]
	ldr		@x[14],[sp,#4*(14)]
	str		@t[3], [sp,#4*(20+16+15)]
	add		@t[3],sp,#4*(20)
	vst1.32		{$a0-$b0},[@t[3]]!		@ copy key
	add		sp,sp,#4*(20)			@ switch frame
	vst1.32		{$c0-$d0},[@t[3]]
	mov		@t[3],#10
	b		.Loop				@ go integer-only

.align	4
.Ltail_neon:
	cmp		@t[3],#64*3
	bhs		.L192_or_more_neon
	cmp		@t[3],#64*2
	bhs		.L128_or_more_neon
	cmp		@t[3],#64*1
	bhs		.L64_or_more_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a0-$b0},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c0-$d0},[@t[0]]
	b		.Loop_tail_neon

.align	4
.L64_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vst1.8		{$a0-$b0},[r14]!
	vst1.8		{$c0-$d0},[r14]!

	beq		.Ldone_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a1-$b1},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c1-$d1},[@t[0]]
	sub		@t[3],@t[3],#64*1	@ len-=64*1
	b		.Loop_tail_neon

.align	4
.L128_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	veor		$b1,$b1,$t1
	 vst1.8		{$a0-$b0},[r14]!
	veor		$c1,$c1,$t2
	 vst1.8		{$c0-$d0},[r14]!
	veor		$d1,$d1,$t3
	vst1.8		{$a1-$b1},[r14]!
	vst1.8		{$c1-$d1},[r14]!

	beq		.Ldone_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a2-$b2},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c2-$d2},[@t[0]]
	sub		@t[3],@t[3],#64*2	@ len-=64*2
	b		.Loop_tail_neon

.align	4
.L192_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	veor		$b1,$b1,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c1,$c1,$t2
	 vst1.8		{$a0-$b0},[r14]!
	veor		$d1,$d1,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a2,$a2,$t0
	 vst1.8		{$c0-$d0},[r14]!
	veor		$b2,$b2,$t1
	 vst1.8		{$a1-$b1},[r14]!
	veor		$c2,$c2,$t2
	 vst1.8		{$c1-$d1},[r14]!
	veor		$d2,$d2,$t3
	vst1.8		{$a2-$b2},[r14]!
	vst1.8		{$c2-$d2},[r14]!

	beq		.Ldone_neon

	ldmia		sp,{@t[0]-@t[3]}	@ load key material
	add		@x[0],@x[0],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(4)
	add		@x[1],@x[1],@t[1]
	add		@x[2],@x[2],@t[2]
	add		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(8)
	add		@x[5],@x[5],@t[1]
	add		@x[6],@x[6],@t[2]
	add		@x[7],@x[7],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	stmia		sp,{@x[0]-@x[7]}
	 add		@x[0],sp,#4*(16+8)

	ldmia		@x[0],{@x[0]-@x[7]}	@ load second half

	add		@x[0],@x[0],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(12)
	add		@x[1],@x[1],@t[1]
	add		@x[2],@x[2],@t[2]
	add		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(8)
	add		@x[5],@x[5],@t[1]
	 add		@x[4],@x[4],#3		@ counter+3
	add		@x[6],@x[6],@t[2]
	add		@x[7],@x[7],@t[3]
	 ldr		@t[3],[sp,#4*(32+2)]	@ re-load len
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	stmia		@t[0],{@x[0]-@x[7]}
	 add		@t[2],sp,#4*(0)
	 sub		@t[3],@t[3],#64*3	@ len-=64*3

.Loop_tail_neon:
	ldrb		@t[0],[@t[2]],#1	@ read buffer on stack
	ldrb		@t[1],[r12],#1		@ read input
	subs		@t[3],@t[3],#1
	eor		@t[0],@t[0],@t[1]
	strb		@t[0],[r14],#1		@ store output
	bne		.Loop_tail_neon

.Ldone_neon:
	add		sp,sp,#4*(32+4)
	vldmia		sp,{d8-d15}
	add		sp,sp,#4*(16+3)
	ldmia		sp!,{r4-r11,pc}
.size	ChaCha20_neon,.-ChaCha20_neon
.comm	OPENSSL_armcap_P,4,4
#endif
___
}}}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
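	# e.g. "q12#lo" becomes "d24" and "q12#hi" becomes "d25"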

	print $_,"\n";
}
close STDOUT;