#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2014
#
# ChaCha20 for ARMv4.
#
# Performance in cycles per byte out of a large buffer.
#
#			IALU/gcc-4.4    1xNEON      3xNEON+1xIALU
#
# Cortex-A5		19.3(*)/+95%    21.8        14.1
# Cortex-A8		10.5(*)/+160%   13.9        6.35
# Cortex-A9		12.9(**)/+110%  14.3        6.50
# Cortex-A15		11.0/+40%       16.0        5.00
# Snapdragon S4		11.5/+125%      13.6        4.90
#
# (*)	most "favourable" result for aligned data on a little-endian
#	processor; the result for misaligned data is 10-15% lower;
# (**)	this result is a trade-off: it can be improved by 20%,
#	but then the Snapdragon S4 and Cortex-A8 results get
#	20-25% worse;

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open OUT,"| \"$^X\" $xlate $flavour $output";
    *STDOUT=*OUT;
} else {
    open OUT,">$output";
    *STDOUT=*OUT;
}

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
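
# Note: the AUTOLOAD thunk above turns a method call into a single line of
# assembly, replacing '_' in the opcode with '.' and prefixing a purely
# numeric last argument with '#'. For example, "&vshr_u32($b,$t,20)" appends
# "\tvshr.u32\t$b,$t,#20\n" to $code, while "&eor(@x[0],@x[0],$xc,'ror#20')"
# appends "\teor\tr0,r0,$xc,ror#20\n".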

my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
my @t=map("r$_",(8..11));

sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my $odd = $d0&1;
my ($xc,$xc_) = (@t[0..1]);
my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]);
my @ret;

	# Consider the order in which variables are addressed by their
	# index:
	#
	#       a   b   c   d
	#
	#       0   4   8  12 < even round
	#       1   5   9  13
	#       2   6  10  14
	#       3   7  11  15
	#       0   5  10  15 < odd round
	#       1   6  11  12
	#       2   7   8  13
	#       3   4   9  14
	#
	# 'a' and 'b' are permanently allocated in registers, @x[0..7],
	# while the 'c's and one pair of 'd's are maintained in memory.
	# If you observe the 'c' column, you'll notice that the pair of
	# 'c's is invariant between rounds. This means that we have to
	# reload them once per round, in the middle. This is why you'll
	# see a bunch of 'c' stores and loads in the middle, but none at
	# the beginning or end. If you observe the 'd' column, you'll
	# notice that 15 and 13 are reused in the next pair of rounds.
	# This is why these two are chosen for offloading to memory, to
	# make the loads count for more.
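	#
	# For reference, each column above goes through the standard
	# ChaCha20 quarter-round
	#	a += b; d ^= a; d <<<= 16;
	#	c += d; b ^= c; b <<<= 12;
	#	a += b; d ^= a; d <<<= 8;
	#	c += d; b ^= c; b <<<= 7;
	# whose left rotations by 16/12/8/7 appear below as the equivalent
	# right rotations by 16/20/24/25, i.e. 'ror#16' ... 'ror#25'.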
							push @ret,(
	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	"&mov	($xd,$xd,'ror#16')",
	 "&add	(@x[$a1],@x[$a1],@x[$b1])",
	 "&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a0],'ror#16')",
	 "&eor	($xd_,$xd_,@x[$a1],'ror#16')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b0],@x[$b0],'ror#20')",
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b1],@x[$b1],'ror#20')",
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#20')",
	 "&eor	(@x[$b1],@x[$b1],$xc_,'ror#20')",

	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	"&mov	($xd,$xd,'ror#24')",
	 "&add	(@x[$a1],@x[$a1],@x[$b1])",
	 "&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a0],'ror#24')",
	 "&eor	($xd_,$xd_,@x[$a1],'ror#24')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b0],@x[$b0],'ror#25')"		);
							push @ret,(
	"&str	($xd,'[sp,#4*(16+$d0)]')",
	"&ldr	($xd,'[sp,#4*(16+$d2)]')"		) if ($odd);
							push @ret,(
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b1],@x[$b1],'ror#25')"		);
							push @ret,(
	 "&str	($xd_,'[sp,#4*(16+$d1)]')",
	 "&ldr	($xd_,'[sp,#4*(16+$d3)]')"		) if (!$odd);
							push @ret,(
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#25')",
	 "&eor	(@x[$b1],@x[$b1],$xc_,'ror#25')"	);

	$xd=@x[$d2]					if (!$odd);
	$xd_=@x[$d3]					if ($odd);
							push @ret,(
	"&str	($xc,'[sp,#4*(16+$c0)]')",
	"&ldr	($xc,'[sp,#4*(16+$c2)]')",
	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	"&mov	($xd,$xd,'ror#16')",
	 "&str	($xc_,'[sp,#4*(16+$c1)]')",
	 "&ldr	($xc_,'[sp,#4*(16+$c3)]')",
	 "&add	(@x[$a3],@x[$a3],@x[$b3])",
	 "&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a2],'ror#16')",
	 "&eor	($xd_,$xd_,@x[$a3],'ror#16')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b2],@x[$b2],'ror#20')",
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b3],@x[$b3],'ror#20')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#20')",
	 "&eor	(@x[$b3],@x[$b3],$xc_,'ror#20')",

	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	"&mov	($xd,$xd,'ror#24')",
	 "&add	(@x[$a3],@x[$a3],@x[$b3])",
	 "&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a2],'ror#24')",
	 "&eor	($xd_,$xd_,@x[$a3],'ror#24')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b2],@x[$b2],'ror#25')",
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b3],@x[$b3],'ror#25')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#25')",
	 "&eor	(@x[$b3],@x[$b3],$xc_,'ror#25')"	);

	@ret;
}

$code.=<<___;
#include <openssl/arm_arch.h>

@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
.arch  armv7-a

.text
#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif

#if defined(__thumb2__) || defined(__clang__)
#define ldrhsb	ldrbhs
#endif

.align	5
.Lsigma:
.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
.Lone:
.long	1,0,0,0
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word   OPENSSL_armcap_P-.LChaCha20_ctr32
#else
.word	-1
#endif

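@ Arguments, as implied by the register usage below: r0=out, r1=inp,
@ r2=len, r3=key, with a pointer to the 16-byte counter-and-nonce block
@ passed on the stack; in C terms this corresponds roughly to
@	void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
@			    size_t len, const unsigned int key[8],
@			    const unsigned int counter[4]);
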
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
.LChaCha20_ctr32:
	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb	sp!,{r0-r2,r4-r11,lr}
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r14,pc,#16		@ ChaCha20_ctr32
#else
	adr	r14,.LChaCha20_ctr32
#endif
	cmp	r2,#0			@ len==0?
#ifdef	__thumb2__
	itt	eq
#endif
	addeq	sp,sp,#4*3
	beq	.Lno_data
#if __ARM_MAX_ARCH__>=7
	cmp	r2,#192			@ test len
	bls	.Lshort
	ldr	r4,[r14,#-32]
	ldr	r4,[r14,r4]
# ifdef	__APPLE__
	ldr	r4,[r4]
# endif
	tst	r4,#ARMV7_NEON
	bne	.LChaCha20_neon
.Lshort:
#endif
	ldmia	r12,{r4-r7}		@ load counter and nonce
	sub	sp,sp,#4*(16)		@ off-load area
	sub	r14,r14,#64		@ .Lsigma
	stmdb	sp!,{r4-r7}		@ copy counter and nonce
	ldmia	r3,{r4-r11}		@ load key
	ldmia	r14,{r0-r3}		@ load sigma
	stmdb	sp!,{r4-r11}		@ copy key
	stmdb	sp!,{r0-r3}		@ copy sigma
	str	r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str	r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	b	.Loop_outer_enter

.align	4
.Loop_outer:
	ldmia	sp,{r0-r9}		@ load key material
	str	@t[3],[sp,#4*(32+2)]	@ save len
	str	r12,  [sp,#4*(32+1)]	@ save inp
	str	r14,  [sp,#4*(32+0)]	@ save out
.Loop_outer_enter:
	ldr	@t[3], [sp,#4*(15)]
	ldr	@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	ldr	@t[2], [sp,#4*(13)]
	ldr	@x[14],[sp,#4*(14)]
	str	@t[3], [sp,#4*(16+15)]
	mov	@t[3],#10
	b	.Loop

.align	4
.Loop:
	subs	@t[3],@t[3],#1
___
	foreach (&ROUND(0, 4, 8,12)) { eval; }
	foreach (&ROUND(0, 5,10,15)) { eval; }
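	# Note: the two ROUND() calls above generate one "column" round
	# (indices 0,4,8,12) and one "diagonal" round (0,5,10,15), so the
	# loop count of 10 set above gives the 20 rounds of ChaCha20.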
$code.=<<___;
	bne	.Loop

	ldr	@t[3],[sp,#4*(32+2)]	@ load len

	str	@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str	@t[1], [sp,#4*(16+9)]
	str	@x[12],[sp,#4*(16+12)]
	str	@t[2], [sp,#4*(16+13)]
	str	@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	cmp	@t[3],#64		@ done yet?
#ifdef	__thumb2__
	itete	lo
#endif
	addlo	r12,sp,#4*(0)		@ shortcut or ...
	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
	addlo	r14,sp,#4*(0)		@ shortcut or ...
	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out

	ldr	@t[0],[sp,#4*(0)]	@ load key material
	ldr	@t[1],[sp,#4*(1)]

#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
# if __ARM_ARCH__<7
	orr	@t[2],r12,r14
	tst	@t[2],#3		@ are input and output aligned?
	ldr	@t[2],[sp,#4*(2)]
	bne	.Lunaligned
	cmp	@t[3],#64		@ restore flags
# else
	ldr	@t[2],[sp,#4*(2)]
# endif
	ldr	@t[3],[sp,#4*(3)]

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]

	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]	@ xor with input
	eorhs	@x[1],@x[1],@t[1]
	 add	@t[0],sp,#4*(4)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[1],[r14,#-12]
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
	 add	@t[0],sp,#4*(8)
	str	@x[4],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[5],[r14,#-12]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[6],[r14,#-8]
	 add	@x[0],sp,#4*(16+8)
	str	@x[7],[r14,#-4]

	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
# ifdef	__thumb2__
	itt	hi
# endif
	 strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	 strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]
	eorhs	@x[1],@x[1],@t[1]
	 add	@t[0],sp,#4*(12)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	str	@x[1],[r14,#-12]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hi
# endif
	 addhi	@t[0],@t[0],#1		@ next counter value
	 strhi	@t[0],[sp,#4*(12)]	@ save next counter value
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	 it	ne
# endif
	 ldrne	@t[0],[sp,#4*(32+2)]	@ re-load len
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[4],[r14],#16		@ store output
	str	@x[5],[r14,#-12]
# ifdef	__thumb2__
	it	hs
# endif
	 subhs	@t[3],@t[0],#64		@ len-=64
	str	@x[6],[r14,#-8]
	str	@x[7],[r14,#-4]
	bhi	.Loop_outer

	beq	.Ldone
# if __ARM_ARCH__<7
	b	.Ltail

.align	4
.Lunaligned:				@ unaligned endian-neutral path
	cmp	@t[3],#64		@ restore flags
# endif
#endif
#if __ARM_ARCH__<7
	ldr	@t[3],[sp,#4*(3)]
___
for ($i=0;$i<16;$i+=4) {
my $j=$i&0x7;

$code.=<<___	if ($i==4);
	add	@x[0],sp,#4*(16+8)
___
$code.=<<___	if ($i==8);
	ldmia	@x[0],{@x[0]-@x[7]}		@ load second half
# ifdef	__thumb2__
	itt	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]		@ copy "@x[10]"
	strhi	@t[3],[sp,#4*(16+11)]		@ copy "@x[11]"
___
$code.=<<___;
	add	@x[$j+0],@x[$j+0],@t[0]		@ accumulate key material
___
$code.=<<___	if ($i==12);
# ifdef	__thumb2__
	itt	hi
# endif
	addhi	@t[0],@t[0],#1			@ next counter value
	strhi	@t[0],[sp,#4*(12)]		@ save next counter value
___
$code.=<<___;
	add	@x[$j+1],@x[$j+1],@t[1]
	add	@x[$j+2],@x[$j+2],@t[2]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[0],@t[0],@t[0]		@ zero or ...
	ldrhsb	@t[0],[r12],#16			@ ... load input
	eorlo	@t[1],@t[1],@t[1]
	ldrhsb	@t[1],[r12,#-12]

	add	@x[$j+3],@x[$j+3],@t[3]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[2],@t[2],@t[2]
	ldrhsb	@t[2],[r12,#-8]
	eorlo	@t[3],@t[3],@t[3]
	ldrhsb	@t[3],[r12,#-4]

	eor	@x[$j+0],@t[0],@x[$j+0]		@ xor with input (or zero)
	eor	@x[$j+1],@t[1],@x[$j+1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-15]		@ load more input
	ldrhsb	@t[1],[r12,#-11]
	eor	@x[$j+2],@t[2],@x[$j+2]
	 strb	@x[$j+0],[r14],#16		@ store output
	eor	@x[$j+3],@t[3],@x[$j+3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-7]
	ldrhsb	@t[3],[r12,#-3]
	 strb	@x[$j+1],[r14,#-12]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+2],[r14,#-8]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-14]		@ load more input
	ldrhsb	@t[1],[r12,#-10]
	 strb	@x[$j+3],[r14,#-4]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+0],[r14,#-15]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-6]
	ldrhsb	@t[3],[r12,#-2]
	 strb	@x[$j+1],[r14,#-11]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+2],[r14,#-7]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-13]		@ load more input
	ldrhsb	@t[1],[r12,#-9]
	 strb	@x[$j+3],[r14,#-3]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+0],[r14,#-14]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-5]
	ldrhsb	@t[3],[r12,#-1]
	 strb	@x[$j+1],[r14,#-10]
	 strb	@x[$j+2],[r14,#-6]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+3],[r14,#-2]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
	 strb	@x[$j+0],[r14,#-13]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+1],[r14,#-9]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
	 strb	@x[$j+2],[r14,#-5]
	 strb	@x[$j+3],[r14,#-1]
___
$code.=<<___	if ($i<12);
	add	@t[0],sp,#4*(4+$i)
	ldmia	@t[0],{@t[0]-@t[3]}		@ load key material
___
}
$code.=<<___;
# ifdef	__thumb2__
	it	ne
# endif
	ldrne	@t[0],[sp,#4*(32+2)]		@ re-load len
# ifdef	__thumb2__
	it	hs
# endif
	subhs	@t[3],@t[0],#64			@ len-=64
	bhi	.Loop_outer

	beq	.Ldone
#endif

.Ltail:
	ldr	r12,[sp,#4*(32+1)]	@ load inp
	add	@t[1],sp,#4*(0)
	ldr	r14,[sp,#4*(32+0)]	@ load out

.Loop_tail:
	ldrb	@t[2],[@t[1]],#1	@ read buffer on stack
	ldrb	@t[3],[r12],#1		@ read input
	subs	@t[0],@t[0],#1
	eor	@t[3],@t[3],@t[2]
	strb	@t[3],[r14],#1		@ store output
	bne	.Loop_tail

.Ldone:
	add	sp,sp,#4*(32+3)
.Lno_data:
	ldmia	sp!,{r4-r11,pc}
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

{{{
my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
    map("q$_",(0..15));

sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&vadd_i32	($a,$a,$b)",
	"&veor		($d,$d,$a)",
	"&vrev32_16	($d,$d)",	# vrot ($d,16)

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,20)",
	"&vsli_32	($b,$t,12)",

	"&vadd_i32	($a,$a,$b)",
	"&veor		($t,$d,$a)",
	"&vshr_u32	($d,$t,24)",
	"&vsli_32	($d,$t,8)",

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,25)",
	"&vsli_32	($b,$t,7)",

	"&vext_8	($c,$c,$c,8)",
	"&vext_8	($b,$b,$b,$odd?12:4)",
	"&vext_8	($d,$d,$d,$odd?4:12)"
	);
}
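
# Note: each NEON "thread" keeps one 64-byte block with one row of the state
# per q-register, so the vshr.u32/vsli.32 pairs above implement the 12-, 8-
# and 7-bit left rotations (vrev32.16 covers the 16-bit one), and the
# trailing vext.8 instructions rotate the 'b', 'c' and 'd' rows by one, two
# and three 32-bit lanes to move between column and diagonal rounds (and by
# the inverse amounts on the way back when $odd is set).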

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	ldr		r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb		sp!,{r0-r2,r4-r11,lr}
.LChaCha20_neon:
	adr		r14,.Lsigma
	vstmdb		sp!,{d8-d15}		@ ABI spec says so
	stmdb		sp!,{r0-r3}

	vld1.32		{$b0-$c0},[r3]		@ load key
	ldmia		r3,{r4-r11}		@ load key

	sub		sp,sp,#4*(16+16)
	vld1.32		{$d0},[r12]		@ load counter and nonce
	add		r12,sp,#4*8
	ldmia		r14,{r0-r3}		@ load sigma
	vld1.32		{$a0},[r14]!		@ load sigma
	vld1.32		{$t0},[r14]		@ one
	vst1.32		{$c0-$d0},[r12]		@ copy 1/2key|counter|nonce
	vst1.32		{$a0-$b0},[sp]		@ copy sigma|1/2key

	str		r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str		r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	vshl.i32	$t1#lo,$t0#lo,#1	@ two
	vstr		$t0#lo,[sp,#4*(16+0)]
	vshl.i32	$t2#lo,$t0#lo,#2	@ four
	vstr		$t1#lo,[sp,#4*(16+2)]
	vmov		$a1,$a0
	vstr		$t2#lo,[sp,#4*(16+4)]
	vmov		$a2,$a0
	vmov		$b1,$b0
	vmov		$b2,$b0
	b		.Loop_neon_enter

.align	4
.Loop_neon_outer:
	ldmia		sp,{r0-r9}		@ load key material
	cmp		@t[3],#64*2		@ if len<=64*2
	bls		.Lbreak_neon		@ switch to integer-only
	vmov		$a1,$a0
	str		@t[3],[sp,#4*(32+2)]	@ save len
	vmov		$a2,$a0
	str		r12,  [sp,#4*(32+1)]	@ save inp
	vmov		$b1,$b0
	str		r14,  [sp,#4*(32+0)]	@ save out
	vmov		$b2,$b0
.Loop_neon_enter:
	ldr		@t[3], [sp,#4*(15)]
	vadd.i32	$d1,$d0,$t0		@ counter+1
	ldr		@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	vmov		$c1,$c0
	ldr		@t[2], [sp,#4*(13)]
	vmov		$c2,$c0
	ldr		@x[14],[sp,#4*(14)]
	vadd.i32	$d2,$d1,$t0		@ counter+2
	str		@t[3], [sp,#4*(16+15)]
	mov		@t[3],#10
	add		@x[12],@x[12],#3	@ counter+3
	b		.Loop_neon

.align	4
.Loop_neon:
	subs		@t[3],@t[3],#1
___
	my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
	my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
	my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
	my @thread3=&ROUND(0,4,8,12);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}

	@thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
	@thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
	@thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
	@thread3=&ROUND(0,5,10,15);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}
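
	# Note: this interleaving is the "3xNEON+1xIALU" arrangement from
	# the performance table: three blocks are processed in NEON
	# registers while a fourth goes through the integer ROUND() code,
	# so the fast path consumes 4*64 bytes of input per outer iteration.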
$code.=<<___;
	bne		.Loop_neon

	add		@t[3],sp,#32
	vld1.32		{$t0-$t1},[sp]		@ load key material
	vld1.32		{$t2-$t3},[@t[3]]

	ldr		@t[3],[sp,#4*(32+2)]	@ load len

	str		@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str		@t[1], [sp,#4*(16+9)]
	str		@x[12],[sp,#4*(16+12)]
	str		@t[2], [sp,#4*(16+13)]
	str		@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	ldr		r12,[sp,#4*(32+1)]	@ load inp
	ldr		r14,[sp,#4*(32+0)]	@ load out

	vadd.i32	$a0,$a0,$t0		@ accumulate key material
	vadd.i32	$a1,$a1,$t0
	vadd.i32	$a2,$a2,$t0
	vldr		$t0#lo,[sp,#4*(16+0)]	@ one

	vadd.i32	$b0,$b0,$t1
	vadd.i32	$b1,$b1,$t1
	vadd.i32	$b2,$b2,$t1
	vldr		$t1#lo,[sp,#4*(16+2)]	@ two

	vadd.i32	$c0,$c0,$t2
	vadd.i32	$c1,$c1,$t2
	vadd.i32	$c2,$c2,$t2
	vadd.i32	$d1#lo,$d1#lo,$t0#lo	@ counter+1
	vadd.i32	$d2#lo,$d2#lo,$t1#lo	@ counter+2

	vadd.i32	$d0,$d0,$t3
	vadd.i32	$d1,$d1,$t3
	vadd.i32	$d2,$d2,$t3

	cmp		@t[3],#64*4
	blo		.Ltail_neon

	vld1.8		{$t0-$t1},[r12]!	@ load input
	 mov		@t[3],sp
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0		@ xor with input
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	 vst1.8		{$a0-$b0},[r14]!	@ store output
	veor		$b1,$b1,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c1,$c1,$t2
	 vst1.8		{$c0-$d0},[r14]!
	veor		$d1,$d1,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a2,$a2,$t0
	 vld1.32	{$a0-$b0},[@t[3]]!	@ load for next iteration
	 veor		$t0#hi,$t0#hi,$t0#hi
	 vldr		$t0#lo,[sp,#4*(16+4)]	@ four
	veor		$b2,$b2,$t1
	 vld1.32	{$c0-$d0},[@t[3]]
	veor		$c2,$c2,$t2
	 vst1.8		{$a1-$b1},[r14]!
	veor		$d2,$d2,$t3
	 vst1.8		{$c1-$d1},[r14]!

	vadd.i32	$d0#lo,$d0#lo,$t0#lo	@ next counter value
	vldr		$t0#lo,[sp,#4*(16+0)]	@ one

	ldmia		sp,{@t[0]-@t[3]}	@ load key material
	add		@x[0],@x[0],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	 vst1.8		{$a2-$b2},[r14]!
	add		@x[1],@x[1],@t[1]
	ldr		@t[1],[r12,#-12]
	 vst1.8		{$c2-$d2},[r14]!
	add		@x[2],@x[2],@t[2]
	ldr		@t[2],[r12,#-8]
	add		@x[3],@x[3],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
# endif
	eor		@x[0],@x[0],@t[0]	@ xor with input
	 add		@t[0],sp,#4*(4)
	eor		@x[1],@x[1],@t[1]
	str		@x[0],[r14],#16		@ store output
	eor		@x[2],@x[2],@t[2]
	str		@x[1],[r14,#-12]
	eor		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[2],[r14,#-8]
	str		@x[3],[r14,#-4]

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	add		@x[5],@x[5],@t[1]
	ldr		@t[1],[r12,#-12]
	add		@x[6],@x[6],@t[2]
	ldr		@t[2],[r12,#-8]
	add		@x[7],@x[7],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	eor		@x[4],@x[4],@t[0]
	 add		@t[0],sp,#4*(8)
	eor		@x[5],@x[5],@t[1]
	str		@x[4],[r14],#16		@ store output
	eor		@x[6],@x[6],@t[2]
	str		@x[5],[r14,#-12]
	eor		@x[7],@x[7],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[6],[r14,#-8]
	 add		@x[0],sp,#4*(16+8)
	str		@x[7],[r14,#-4]

	ldmia		@x[0],{@x[0]-@x[7]}	@ load second half

	add		@x[0],@x[0],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	add		@x[1],@x[1],@t[1]
	ldr		@t[1],[r12,#-12]
# ifdef	__thumb2__
	it	hi
# endif
	 strhi		@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	add		@x[2],@x[2],@t[2]
	ldr		@t[2],[r12,#-8]
# ifdef	__thumb2__
	it	hi
# endif
	 strhi		@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add		@x[3],@x[3],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
# endif
	eor		@x[0],@x[0],@t[0]
	 add		@t[0],sp,#4*(12)
	eor		@x[1],@x[1],@t[1]
	str		@x[0],[r14],#16		@ store output
	eor		@x[2],@x[2],@t[2]
	str		@x[1],[r14,#-12]
	eor		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[2],[r14,#-8]
	str		@x[3],[r14,#-4]

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	 add		@t[0],@t[0],#4		@ next counter value
	add		@x[5],@x[5],@t[1]
	 str		@t[0],[sp,#4*(12)]	@ save next counter value
	ldr		@t[0],[r12],#16		@ load input
	add		@x[6],@x[6],@t[2]
	 add		@x[4],@x[4],#3		@ counter+3
	ldr		@t[1],[r12,#-12]
	add		@x[7],@x[7],@t[3]
	ldr		@t[2],[r12,#-8]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	eor		@x[4],@x[4],@t[0]
# ifdef	__thumb2__
	it	hi
# endif
	 ldrhi		@t[0],[sp,#4*(32+2)]	@ re-load len
	eor		@x[5],@x[5],@t[1]
	eor		@x[6],@x[6],@t[2]
	str		@x[4],[r14],#16		@ store output
	eor		@x[7],@x[7],@t[3]
	str		@x[5],[r14,#-12]
	 sub		@t[3],@t[0],#64*4	@ len-=64*4
	str		@x[6],[r14,#-8]
	str		@x[7],[r14,#-4]
	bhi		.Loop_neon_outer

	b		.Ldone_neon

.align	4
.Lbreak_neon:
	@ harmonize NEON and integer-only stack frames: load data
	@ from NEON frame, but save to integer-only one; distance
	@ between the two is 4*(32+4+16-32)=4*(20).

	str		@t[3], [sp,#4*(20+32+2)]	@ save len
	 add		@t[3],sp,#4*(32+4)
	str		r12,   [sp,#4*(20+32+1)]	@ save inp
	str		r14,   [sp,#4*(20+32+0)]	@ save out

	ldr		@x[12],[sp,#4*(16+10)]
	ldr		@x[14],[sp,#4*(16+11)]
	 vldmia		@t[3],{d8-d15}			@ fulfill ABI requirement
	str		@x[12],[sp,#4*(20+16+10)]	@ copy "@x[10]"
	str		@x[14],[sp,#4*(20+16+11)]	@ copy "@x[11]"

	ldr		@t[3], [sp,#4*(15)]
	ldr		@x[12],[sp,#4*(12)]		@ modulo-scheduled load
	ldr		@t[2], [sp,#4*(13)]
	ldr		@x[14],[sp,#4*(14)]
	str		@t[3], [sp,#4*(20+16+15)]
	add		@t[3],sp,#4*(20)
	vst1.32		{$a0-$b0},[@t[3]]!		@ copy key
	add		sp,sp,#4*(20)			@ switch frame
	vst1.32		{$c0-$d0},[@t[3]]
	mov		@t[3],#10
	b		.Loop				@ go integer-only

.align	4
.Ltail_neon:
	cmp		@t[3],#64*3
	bhs		.L192_or_more_neon
	cmp		@t[3],#64*2
	bhs		.L128_or_more_neon
	cmp		@t[3],#64*1
	bhs		.L64_or_more_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a0-$b0},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c0-$d0},[@t[0]]
	b		.Loop_tail_neon

.align	4
.L64_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vst1.8		{$a0-$b0},[r14]!
	vst1.8		{$c0-$d0},[r14]!

	beq		.Ldone_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a1-$b1},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c1-$d1},[@t[0]]
	sub		@t[3],@t[3],#64*1	@ len-=64*1
	b		.Loop_tail_neon

.align	4
.L128_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	veor		$b1,$b1,$t1
	 vst1.8		{$a0-$b0},[r14]!
	veor		$c1,$c1,$t2
	 vst1.8		{$c0-$d0},[r14]!
	veor		$d1,$d1,$t3
	vst1.8		{$a1-$b1},[r14]!
	vst1.8		{$c1-$d1},[r14]!

	beq		.Ldone_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a2-$b2},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c2-$d2},[@t[0]]
	sub		@t[3],@t[3],#64*2	@ len-=64*2
	b		.Loop_tail_neon

.align	4
.L192_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	veor		$b1,$b1,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c1,$c1,$t2
	 vst1.8		{$a0-$b0},[r14]!
	veor		$d1,$d1,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a2,$a2,$t0
	 vst1.8		{$c0-$d0},[r14]!
	veor		$b2,$b2,$t1
	 vst1.8		{$a1-$b1},[r14]!
	veor		$c2,$c2,$t2
	 vst1.8		{$c1-$d1},[r14]!
	veor		$d2,$d2,$t3
	vst1.8		{$a2-$b2},[r14]!
	vst1.8		{$c2-$d2},[r14]!

	beq		.Ldone_neon

	ldmia		sp,{@t[0]-@t[3]}	@ load key material
	add		@x[0],@x[0],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(4)
	add		@x[1],@x[1],@t[1]
	add		@x[2],@x[2],@t[2]
	add		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(8)
	add		@x[5],@x[5],@t[1]
	add		@x[6],@x[6],@t[2]
	add		@x[7],@x[7],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	stmia		sp,{@x[0]-@x[7]}
	 add		@x[0],sp,#4*(16+8)

	ldmia		@x[0],{@x[0]-@x[7]}	@ load second half

	add		@x[0],@x[0],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(12)
	add		@x[1],@x[1],@t[1]
	add		@x[2],@x[2],@t[2]
	add		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(8)
	add		@x[5],@x[5],@t[1]
	 add		@x[4],@x[4],#3		@ counter+3
	add		@x[6],@x[6],@t[2]
	add		@x[7],@x[7],@t[3]
	 ldr		@t[3],[sp,#4*(32+2)]	@ re-load len
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	stmia		@t[0],{@x[0]-@x[7]}
	 add		@t[2],sp,#4*(0)
	 sub		@t[3],@t[3],#64*3	@ len-=64*3

.Loop_tail_neon:
	ldrb		@t[0],[@t[2]],#1	@ read buffer on stack
	ldrb		@t[1],[r12],#1		@ read input
	subs		@t[3],@t[3],#1
	eor		@t[0],@t[0],@t[1]
	strb		@t[0],[r14],#1		@ store output
	bne		.Loop_tail_neon

.Ldone_neon:
	add		sp,sp,#4*(32+4)
	vldmia		sp,{d8-d15}
	add		sp,sp,#4*(16+3)
	ldmia		sp!,{r4-r11,pc}
.size	ChaCha20_neon,.-ChaCha20_neon
.comm	OPENSSL_armcap_P,4,4
#endif
___
}}}
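
# Note: the filter below expands backtick-quoted Perl inside $code and
# rewrites the "qN#lo"/"qN#hi" notation used above into the corresponding
# D registers, e.g. "q12#lo" becomes "d24" and "q12#hi" becomes "d25".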
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT";