• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env perl
2# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8#
9# ====================================================================
10# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11# project. The module is, however, dual licensed under OpenSSL and
12# CRYPTOGAMS licenses depending on where you obtain it. For further
13# details see http://www.openssl.org/~appro/cryptogams/.
14# ====================================================================
15#
16# Keccak-1600 for ARMv8.
17#
18# June 2017.
19#
# This is a straightforward KECCAK_1X_ALT implementation. It makes no
# sense to attempt a SIMD/NEON implementation for the following reason.
# 64-bit lanes of vector registers can't be addressed as easily as in
# 32-bit mode. This means that 64-bit NEON is bound to be slower than
# 32-bit NEON, and this implementation is faster than 32-bit NEON on
# the same processor. Even though it takes more scalar xor's and
# andn's, this is compensated for by the availability of rotate. Not
# to forget that most processors achieve a higher issue rate with
# scalar instructions.
28#
29# February 2018.
30#
# Add a hardware-assisted ARMv8.2 implementation. It's a KECCAK_1X_ALT
# variant with a register permutation/rotation twist that makes it
# possible to eliminate copies to temporary registers. If you look
# closely you'll notice that it uses only one lane of the vector
# registers. The new instructions effectively facilitate parallel
# hashing, which we don't support [yet?]. But the lowest-level core
# procedure is prepared for it. The inner round is 67 [vector]
# instructions, so it's not actually obvious that it will provide a
# performance improvement [in serial hash] as long as the vector
# instruction issue rate is limited to 1 per cycle...
41#
42######################################################################
43# Numbers are cycles per processed byte.
44#
45#		r=1088(*)
46#
47# Cortex-A53	13
48# Cortex-A57	12
49# X-Gene	14
50# Mongoose	10
51# Kryo		12
52# Denver	7.8
53# Apple A7	7.2
54# ThunderX2	9.7
55#
# (*)	Corresponds to SHA3-256. No improvement coefficients are listed
#	because they vary too much from compiler to compiler. Newer
#	compilers do much better, and the improvement varies from 5% on
#	Cortex-A57 to 25% on Cortex-A53. In comparison to older
#	compilers, however, this code is at least 2x faster...
61
# Command line: [flavour] [output-file]
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the arm-xlate.pl translator: first in the directory this script
# was invoked from, then in ../../perlasm/ relative to that directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Run the translator with the current interpreter ($^X) and make it the
# destination of everything this script prints to STDOUT.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
75
# Per-lane rotation amounts, indexed [row][column] like the @A register maps
# used by the code generators in this file.  The generated code consumes them
# as "#64-$rhotates[i][j]" immediates of ror/xar, i.e. a right-rotation by
# 64-n, which is the same as a left-rotation by n.
my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);
81
# Read-only data: the table of 24 round constants ("iotas") that the round
# functions fetch one per round.
#
# The scalar round loop has no counter; it stops when the low 8 bits of its
# table pointer are zero (it does "tst ...,#255" on the post-incremented
# pointer).  That works because the table is preceded by 8 zero quads
# (64 bytes) after the ".align 8" and is itself 24*8 = 192 bytes long, so
# the end of the table is 256 bytes past the aligned address -- assuming
# ".align 8" means 2^8-byte alignment here, which the termination test
# relies on.
$code.=<<___;
.rodata

.align 8	// strategic alignment and padding that allows to use
		// address value as loop termination condition...
	.quad	0,0,0,0,0,0,0,0
.type	iotas,%object
iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas,.-iotas
___
# Scalar (general-purpose register) code path.
#
# @A maps the 5x5 lanes of the state onto x0-x24, except that A[3][3] is
# moved to x25 because x18 is reserved.  @C names the scratch registers
# x26, x27, x28 and x30.
#
# Routines emitted by this section:
#   KeccakF1600_int - round loop on the register-resident state.  It keeps
#                     its iotas pointer and x30 at [sp,#16] and a pair of
#                     lanes at [sp,#0], so the caller must leave 32 bytes
#                     free at sp.  The loop ends when the low 8 bits of the
#                     post-incremented iotas pointer are zero.
#   KeccakF1600     - loads the 25 lanes from the buffer in x0, calls
#                     KeccakF1600_int and stores the lanes back.
#   SHA3_absorb     - x0 = A[5][5], x1 = inp, x2 = len, x3 = bsz.  While
#                     len >= bsz it xors bsz bytes of input into the lanes
#                     and calls KeccakF1600_int; it returns the remaining
#                     length in x0.
#   SHA3_squeeze    - x0 = A, x1 = out, x2 = len, x3 = bsz.  Copies lanes
#                     to out 8 bytes at a time (the final partial lane byte
#                     by byte) and calls KeccakF1600 each time bsz bytes
#                     have been produced.
								{{{
my @A = map([ "x$_", "x".($_+1), "x".($_+2), "x".($_+3), "x".($_+4) ],
            (0, 5, 10, 15, 20));
   $A[3][3] = "x25"; # x18 is reserved

my @C = map("x$_", (26,27,28,30));

$code.=<<___;
.text

.type	KeccakF1600_int,%function
.align	5
KeccakF1600_int:
	adrp	$C[2],iotas
	add	$C[2],$C[2],:lo12:iotas
	.inst	0xd503233f			// paciasp
	stp	$C[2],x30,[sp,#16]		// 32 bytes on top are mine
	b	.Loop
.align	4
.Loop:
	////////////////////////////////////////// Theta
	eor	$C[0],$A[0][0],$A[1][0]
	stp	$A[0][4],$A[1][4],[sp,#0]	// offload pair...
	eor	$C[1],$A[0][1],$A[1][1]
	eor	$C[2],$A[0][2],$A[1][2]
	eor	$C[3],$A[0][3],$A[1][3]
___
	# A[0][4] and A[1][4] were just saved to the stack, so their registers
	# double as two extra temporaries, C[4] and C[5], until they are
	# re-loaded further down.
	$C[4]=$A[0][4];
	$C[5]=$A[1][4];
$code.=<<___;
	eor	$C[4],$A[0][4],$A[1][4]
	eor	$C[0],$C[0],$A[2][0]
	eor	$C[1],$C[1],$A[2][1]
	eor	$C[2],$C[2],$A[2][2]
	eor	$C[3],$C[3],$A[2][3]
	eor	$C[4],$C[4],$A[2][4]
	eor	$C[0],$C[0],$A[3][0]
	eor	$C[1],$C[1],$A[3][1]
	eor	$C[2],$C[2],$A[3][2]
	eor	$C[3],$C[3],$A[3][3]
	eor	$C[4],$C[4],$A[3][4]
	eor	$C[0],$C[0],$A[4][0]
	eor	$C[2],$C[2],$A[4][2]
	eor	$C[1],$C[1],$A[4][1]
	eor	$C[3],$C[3],$A[4][3]
	eor	$C[4],$C[4],$A[4][4]

	eor	$C[5],$C[0],$C[2],ror#63

	eor	$A[0][1],$A[0][1],$C[5]
	eor	$A[1][1],$A[1][1],$C[5]
	eor	$A[2][1],$A[2][1],$C[5]
	eor	$A[3][1],$A[3][1],$C[5]
	eor	$A[4][1],$A[4][1],$C[5]

	eor	$C[5],$C[1],$C[3],ror#63
	eor	$C[2],$C[2],$C[4],ror#63
	eor	$C[3],$C[3],$C[0],ror#63
	eor	$C[4],$C[4],$C[1],ror#63

	eor	$C[1],   $A[0][2],$C[5]		// mov	$C[1],$A[0][2]
	eor	$A[1][2],$A[1][2],$C[5]
	eor	$A[2][2],$A[2][2],$C[5]
	eor	$A[3][2],$A[3][2],$C[5]
	eor	$A[4][2],$A[4][2],$C[5]

	eor	$A[0][0],$A[0][0],$C[4]
	eor	$A[1][0],$A[1][0],$C[4]
	eor	$A[2][0],$A[2][0],$C[4]
	eor	$A[3][0],$A[3][0],$C[4]
	eor	$A[4][0],$A[4][0],$C[4]
___
	# The borrowed registers go back to being A[0][4]/A[1][4]; drop the
	# aliases so any later use of C[4]/C[5] would interpolate as empty.
	$C[4]=undef;
	$C[5]=undef;
$code.=<<___;
	ldp	$A[0][4],$A[1][4],[sp,#0]	// re-load offloaded data
	eor	$C[0],   $A[0][3],$C[2]		// mov	$C[0],$A[0][3]
	eor	$A[1][3],$A[1][3],$C[2]
	eor	$A[2][3],$A[2][3],$C[2]
	eor	$A[3][3],$A[3][3],$C[2]
	eor	$A[4][3],$A[4][3],$C[2]

	eor	$C[2],   $A[0][4],$C[3]		// mov	$C[2],$A[0][4]
	eor	$A[1][4],$A[1][4],$C[3]
	eor	$A[2][4],$A[2][4],$C[3]
	eor	$A[3][4],$A[3][4],$C[3]
	eor	$A[4][4],$A[4][4],$C[3]

	////////////////////////////////////////// Rho+Pi
	mov	$C[3],$A[0][1]
	ror	$A[0][1],$A[1][1],#64-$rhotates[1][1]
	//mov	$C[1],$A[0][2]
	ror	$A[0][2],$A[2][2],#64-$rhotates[2][2]
	//mov	$C[0],$A[0][3]
	ror	$A[0][3],$A[3][3],#64-$rhotates[3][3]
	//mov	$C[2],$A[0][4]
	ror	$A[0][4],$A[4][4],#64-$rhotates[4][4]

	ror	$A[1][1],$A[1][4],#64-$rhotates[1][4]
	ror	$A[2][2],$A[2][3],#64-$rhotates[2][3]
	ror	$A[3][3],$A[3][2],#64-$rhotates[3][2]
	ror	$A[4][4],$A[4][1],#64-$rhotates[4][1]

	ror	$A[1][4],$A[4][2],#64-$rhotates[4][2]
	ror	$A[2][3],$A[3][4],#64-$rhotates[3][4]
	ror	$A[3][2],$A[2][1],#64-$rhotates[2][1]
	ror	$A[4][1],$A[1][3],#64-$rhotates[1][3]

	ror	$A[4][2],$A[2][4],#64-$rhotates[2][4]
	ror	$A[3][4],$A[4][3],#64-$rhotates[4][3]
	ror	$A[2][1],$A[1][2],#64-$rhotates[1][2]
	ror	$A[1][3],$A[3][1],#64-$rhotates[3][1]

	ror	$A[2][4],$A[4][0],#64-$rhotates[4][0]
	ror	$A[4][3],$A[3][0],#64-$rhotates[3][0]
	ror	$A[1][2],$A[2][0],#64-$rhotates[2][0]
	ror	$A[3][1],$A[1][0],#64-$rhotates[1][0]

	ror	$A[1][0],$C[0],#64-$rhotates[0][3]
	ror	$A[2][0],$C[3],#64-$rhotates[0][1]
	ror	$A[3][0],$C[2],#64-$rhotates[0][4]
	ror	$A[4][0],$C[1],#64-$rhotates[0][2]

	////////////////////////////////////////// Chi+Iota
	bic	$C[0],$A[0][2],$A[0][1]
	bic	$C[1],$A[0][3],$A[0][2]
	bic	$C[2],$A[0][0],$A[0][4]
	bic	$C[3],$A[0][1],$A[0][0]
	eor	$A[0][0],$A[0][0],$C[0]
	bic	$C[0],$A[0][4],$A[0][3]
	eor	$A[0][1],$A[0][1],$C[1]
	 ldr	$C[1],[sp,#16]
	eor	$A[0][3],$A[0][3],$C[2]
	eor	$A[0][4],$A[0][4],$C[3]
	eor	$A[0][2],$A[0][2],$C[0]
	 ldr	$C[3],[$C[1]],#8		// Iota[i++]

	bic	$C[0],$A[1][2],$A[1][1]
	 tst	$C[1],#255			// are we done?
	 str	$C[1],[sp,#16]
	bic	$C[1],$A[1][3],$A[1][2]
	bic	$C[2],$A[1][0],$A[1][4]
	 eor	$A[0][0],$A[0][0],$C[3]		// A[0][0] ^= Iota
	bic	$C[3],$A[1][1],$A[1][0]
	eor	$A[1][0],$A[1][0],$C[0]
	bic	$C[0],$A[1][4],$A[1][3]
	eor	$A[1][1],$A[1][1],$C[1]
	eor	$A[1][3],$A[1][3],$C[2]
	eor	$A[1][4],$A[1][4],$C[3]
	eor	$A[1][2],$A[1][2],$C[0]

	bic	$C[0],$A[2][2],$A[2][1]
	bic	$C[1],$A[2][3],$A[2][2]
	bic	$C[2],$A[2][0],$A[2][4]
	bic	$C[3],$A[2][1],$A[2][0]
	eor	$A[2][0],$A[2][0],$C[0]
	bic	$C[0],$A[2][4],$A[2][3]
	eor	$A[2][1],$A[2][1],$C[1]
	eor	$A[2][3],$A[2][3],$C[2]
	eor	$A[2][4],$A[2][4],$C[3]
	eor	$A[2][2],$A[2][2],$C[0]

	bic	$C[0],$A[3][2],$A[3][1]
	bic	$C[1],$A[3][3],$A[3][2]
	bic	$C[2],$A[3][0],$A[3][4]
	bic	$C[3],$A[3][1],$A[3][0]
	eor	$A[3][0],$A[3][0],$C[0]
	bic	$C[0],$A[3][4],$A[3][3]
	eor	$A[3][1],$A[3][1],$C[1]
	eor	$A[3][3],$A[3][3],$C[2]
	eor	$A[3][4],$A[3][4],$C[3]
	eor	$A[3][2],$A[3][2],$C[0]

	bic	$C[0],$A[4][2],$A[4][1]
	bic	$C[1],$A[4][3],$A[4][2]
	bic	$C[2],$A[4][0],$A[4][4]
	bic	$C[3],$A[4][1],$A[4][0]
	eor	$A[4][0],$A[4][0],$C[0]
	bic	$C[0],$A[4][4],$A[4][3]
	eor	$A[4][1],$A[4][1],$C[1]
	eor	$A[4][3],$A[4][3],$C[2]
	eor	$A[4][4],$A[4][4],$C[3]
	eor	$A[4][2],$A[4][2],$C[0]

	bne	.Loop

	ldr	x30,[sp,#24]
	.inst	0xd50323bf			// autiasp
	ret
.size	KeccakF1600_int,.-KeccakF1600_int

.type	KeccakF1600,%function
.align	5
KeccakF1600:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#48

	str	x0,[sp,#32]			// offload argument
	mov	$C[0],x0
	ldp	$A[0][0],$A[0][1],[x0,#16*0]
	ldp	$A[0][2],$A[0][3],[$C[0],#16*1]
	ldp	$A[0][4],$A[1][0],[$C[0],#16*2]
	ldp	$A[1][1],$A[1][2],[$C[0],#16*3]
	ldp	$A[1][3],$A[1][4],[$C[0],#16*4]
	ldp	$A[2][0],$A[2][1],[$C[0],#16*5]
	ldp	$A[2][2],$A[2][3],[$C[0],#16*6]
	ldp	$A[2][4],$A[3][0],[$C[0],#16*7]
	ldp	$A[3][1],$A[3][2],[$C[0],#16*8]
	ldp	$A[3][3],$A[3][4],[$C[0],#16*9]
	ldp	$A[4][0],$A[4][1],[$C[0],#16*10]
	ldp	$A[4][2],$A[4][3],[$C[0],#16*11]
	ldr	$A[4][4],[$C[0],#16*12]

	bl	KeccakF1600_int

	ldr	$C[0],[sp,#32]
	stp	$A[0][0],$A[0][1],[$C[0],#16*0]
	stp	$A[0][2],$A[0][3],[$C[0],#16*1]
	stp	$A[0][4],$A[1][0],[$C[0],#16*2]
	stp	$A[1][1],$A[1][2],[$C[0],#16*3]
	stp	$A[1][3],$A[1][4],[$C[0],#16*4]
	stp	$A[2][0],$A[2][1],[$C[0],#16*5]
	stp	$A[2][2],$A[2][3],[$C[0],#16*6]
	stp	$A[2][4],$A[3][0],[$C[0],#16*7]
	stp	$A[3][1],$A[3][2],[$C[0],#16*8]
	stp	$A[3][3],$A[3][4],[$C[0],#16*9]
	stp	$A[4][0],$A[4][1],[$C[0],#16*10]
	stp	$A[4][2],$A[4][3],[$C[0],#16*11]
	str	$A[4][4],[$C[0],#16*12]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#48
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
	.inst	0xd50323bf			// autiasp
	ret
.size	KeccakF1600,.-KeccakF1600

.globl	SHA3_absorb
.type	SHA3_absorb,%function
.align	5
SHA3_absorb:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	stp	x0,x1,[sp,#32]			// offload arguments
	stp	x2,x3,[sp,#48]

	mov	$C[0],x0			// uint64_t A[5][5]
	mov	$C[1],x1			// const void *inp
	mov	$C[2],x2			// size_t len
	mov	$C[3],x3			// size_t bsz
	ldp	$A[0][0],$A[0][1],[$C[0],#16*0]
	ldp	$A[0][2],$A[0][3],[$C[0],#16*1]
	ldp	$A[0][4],$A[1][0],[$C[0],#16*2]
	ldp	$A[1][1],$A[1][2],[$C[0],#16*3]
	ldp	$A[1][3],$A[1][4],[$C[0],#16*4]
	ldp	$A[2][0],$A[2][1],[$C[0],#16*5]
	ldp	$A[2][2],$A[2][3],[$C[0],#16*6]
	ldp	$A[2][4],$A[3][0],[$C[0],#16*7]
	ldp	$A[3][1],$A[3][2],[$C[0],#16*8]
	ldp	$A[3][3],$A[3][4],[$C[0],#16*9]
	ldp	$A[4][0],$A[4][1],[$C[0],#16*10]
	ldp	$A[4][2],$A[4][3],[$C[0],#16*11]
	ldr	$A[4][4],[$C[0],#16*12]
	b	.Loop_absorb

.align	4
.Loop_absorb:
	subs	$C[0],$C[2],$C[3]		// len - bsz
	blo	.Labsorbed

	str	$C[0],[sp,#48]			// save len - bsz
___
# Unrolled absorption of one block, two lanes ($i and $i+1) per iteration.
# After each even lane bsz is compared with the number of bytes consumed
# once the following odd lane is included: "blo" leaves before the odd
# lane, "beq" (same flags) leaves right after it.
for (my $i=0; $i<24; $i+=2) {
my $j = $i+1;
$code.=<<___;
	ldr	$C[0],[$C[1]],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	$C[0],$C[0]
#endif
	eor	$A[$i/5][$i%5],$A[$i/5][$i%5],$C[0]
	cmp	$C[3],#8*($i+2)
	blo	.Lprocess_block
	ldr	$C[0],[$C[1]],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	$C[0],$C[0]
#endif
	eor	$A[$j/5][$j%5],$A[$j/5][$j%5],$C[0]
	beq	.Lprocess_block
___
}
$code.=<<___;
	ldr	$C[0],[$C[1]],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	$C[0],$C[0]
#endif
	eor	$A[4][4],$A[4][4],$C[0]

.Lprocess_block:
	str	$C[1],[sp,#40]			// save inp

	bl	KeccakF1600_int

	ldr	$C[1],[sp,#40]			// restore arguments
	ldp	$C[2],$C[3],[sp,#48]
	b	.Loop_absorb

.align	4
.Labsorbed:
	ldr	$C[1],[sp,#32]
	stp	$A[0][0],$A[0][1],[$C[1],#16*0]
	stp	$A[0][2],$A[0][3],[$C[1],#16*1]
	stp	$A[0][4],$A[1][0],[$C[1],#16*2]
	stp	$A[1][1],$A[1][2],[$C[1],#16*3]
	stp	$A[1][3],$A[1][4],[$C[1],#16*4]
	stp	$A[2][0],$A[2][1],[$C[1],#16*5]
	stp	$A[2][2],$A[2][3],[$C[1],#16*6]
	stp	$A[2][4],$A[3][0],[$C[1],#16*7]
	stp	$A[3][1],$A[3][2],[$C[1],#16*8]
	stp	$A[3][3],$A[3][4],[$C[1],#16*9]
	stp	$A[4][0],$A[4][1],[$C[1],#16*10]
	stp	$A[4][2],$A[4][3],[$C[1],#16*11]
	str	$A[4][4],[$C[1],#16*12]

	mov	x0,$C[2]			// return value
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
	.inst	0xd50323bf			// autiasp
	ret
.size	SHA3_absorb,.-SHA3_absorb
___
{
# SHA3_squeeze keeps copies of its four arguments in x19-x22 (saved in its
# prologue) so that they survive the call to KeccakF1600.
my ($A_flat,$out,$len,$bsz) = map("x$_",(19..22));
$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	mov	$A_flat,x0			// put aside arguments
	mov	$out,x1
	mov	$len,x2
	mov	$bsz,x3

.Loop_squeeze:
	ldr	x4,[x0],#8
	cmp	$len,#8
	blo	.Lsqueeze_tail
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[$out],#8
	subs	$len,$len,#8
	beq	.Lsqueeze_done

	subs	x3,x3,#8
	bhi	.Loop_squeeze

	mov	x0,$A_flat
	bl	KeccakF1600
	mov	x0,$A_flat
	mov	x3,$bsz
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1

.Lsqueeze_done:
	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x29,x30,[sp],#48
	.inst	0xd50323bf			// autiasp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze
___
}								}}}
# ARMv8.2 SHA3-extension code path.
#
# @A maps the 5x5 lanes of the state onto v0-v24 and @C names v25-v31 as
# scratch; @D is a re-ordered view of five of the @C registers, matching
# the D[n] annotations on the rax1 lines below.  All register names carry
# the .16b arrangement.
#
# The eor3/rax1/xar/bcax mnemonics written here are not emitted as such:
# the output loop at the end of this file passes them to unsha3(), and it
# also rewrites .16b to .2d on the ld1r line.
#
# Routines emitted by this section:
#   KeccakF1600_ce    - 24 rounds (counted in x9) on the register-resident
#                       state, with x10 walking the iotas table.
#   KeccakF1600_cext  - saves d8-d15, loads the 25 lanes from the buffer in
#                       x0 into d0-d24, calls KeccakF1600_ce and stores the
#                       lanes back.
#   SHA3_absorb_cext  - x0 = state, x1 = inp, x2 = len, x3 = bsz.  While
#                       len >= bsz it xors bsz bytes of input into the
#                       lanes and calls KeccakF1600_ce; it returns the
#                       remaining length in x0.
#   SHA3_squeeze_cext - x0 = state, x1 = out, x2 = len, x3 = bsz.  Copies
#                       lanes to out 8 bytes at a time (the final partial
#                       lane byte by byte) and calls KeccakF1600_cext each
#                       time bsz bytes have been produced.
								{{{
my @A = map([ "v".$_.".16b", "v".($_+1).".16b", "v".($_+2).".16b",
                             "v".($_+3).".16b", "v".($_+4).".16b" ],
            (0, 5, 10, 15, 20));

my @C = map("v$_.16b", (25..31));
my @D = @C[4,5,6,2,3];

$code.=<<___;
.type	KeccakF1600_ce,%function
.align	5
KeccakF1600_ce:
	mov	x9,#24
	adrp	x10,iotas
	add	x10,x10,:lo12:iotas
	b	.Loop_ce
.align	4
.Loop_ce:
	////////////////////////////////////////////////// Theta
	eor3	$C[0],$A[4][0],$A[3][0],$A[2][0]
	eor3	$C[1],$A[4][1],$A[3][1],$A[2][1]
	eor3	$C[2],$A[4][2],$A[3][2],$A[2][2]
	eor3	$C[3],$A[4][3],$A[3][3],$A[2][3]
	eor3	$C[4],$A[4][4],$A[3][4],$A[2][4]
	eor3	$C[0],$C[0],   $A[1][0],$A[0][0]
	eor3	$C[1],$C[1],   $A[1][1],$A[0][1]
	eor3	$C[2],$C[2],   $A[1][2],$A[0][2]
	eor3	$C[3],$C[3],   $A[1][3],$A[0][3]
	eor3	$C[4],$C[4],   $A[1][4],$A[0][4]

	rax1	$C[5],$C[0],$C[2]			// D[1]
	rax1	$C[6],$C[1],$C[3]			// D[2]
	rax1	$C[2],$C[2],$C[4]			// D[3]
	rax1	$C[3],$C[3],$C[0]			// D[4]
	rax1	$C[4],$C[4],$C[1]			// D[0]

	////////////////////////////////////////////////// Theta+Rho+Pi
	xar	$C[0],   $A[0][1],$D[1],#64-$rhotates[0][1] // C[0]=A[2][0]

	xar	$A[0][1],$A[1][1],$D[1],#64-$rhotates[1][1]
	xar	$A[1][1],$A[1][4],$D[4],#64-$rhotates[1][4]
	xar	$A[1][4],$A[4][2],$D[2],#64-$rhotates[4][2]
	xar	$A[4][2],$A[2][4],$D[4],#64-$rhotates[2][4]
	xar	$A[2][4],$A[4][0],$D[0],#64-$rhotates[4][0]

	xar	$C[1],   $A[0][2],$D[2],#64-$rhotates[0][2] // C[1]=A[4][0]

	xar	$A[0][2],$A[2][2],$D[2],#64-$rhotates[2][2]
	xar	$A[2][2],$A[2][3],$D[3],#64-$rhotates[2][3]
	xar	$A[2][3],$A[3][4],$D[4],#64-$rhotates[3][4]
	xar	$A[3][4],$A[4][3],$D[3],#64-$rhotates[4][3]
	xar	$A[4][3],$A[3][0],$D[0],#64-$rhotates[3][0]

	xar	$A[3][0],$A[0][4],$D[4],#64-$rhotates[0][4]

	xar	$D[4],   $A[4][4],$D[4],#64-$rhotates[4][4] // D[4]=A[0][4]
	xar	$A[4][4],$A[4][1],$D[1],#64-$rhotates[4][1]
	xar	$A[1][3],$A[1][3],$D[3],#64-$rhotates[1][3] // A[1][3]=A[4][1]
	xar	$A[0][4],$A[3][1],$D[1],#64-$rhotates[3][1] // A[0][4]=A[1][3]
	xar	$A[3][1],$A[1][0],$D[0],#64-$rhotates[1][0]

	xar	$A[1][0],$A[0][3],$D[3],#64-$rhotates[0][3]

	eor	$A[0][0],$A[0][0],$D[0]

	xar	$D[3],   $A[3][3],$D[3],#64-$rhotates[3][3] // D[3]=A[0][3]
	xar	$A[0][3],$A[3][2],$D[2],#64-$rhotates[3][2] // A[0][3]=A[3][3]
	xar	$D[1],   $A[2][1],$D[1],#64-$rhotates[2][1] // D[1]=A[3][2]
	xar	$D[2],   $A[1][2],$D[2],#64-$rhotates[1][2] // D[2]=A[2][1]
	xar	$D[0],   $A[2][0],$D[0],#64-$rhotates[2][0] // D[0]=A[1][2]

	////////////////////////////////////////////////// Chi+Iota
	bcax	$A[4][0],$C[1],   $A[4][2],$A[1][3]	// A[1][3]=A[4][1]
	bcax	$A[4][1],$A[1][3],$A[4][3],$A[4][2]	// A[1][3]=A[4][1]
	bcax	$A[4][2],$A[4][2],$A[4][4],$A[4][3]
	bcax	$A[4][3],$A[4][3],$C[1],   $A[4][4]
	bcax	$A[4][4],$A[4][4],$A[1][3],$C[1]	// A[1][3]=A[4][1]

	ld1r	{$C[1]},[x10],#8

	bcax	$A[3][2],$D[1],   $A[3][4],$A[0][3]	// A[0][3]=A[3][3]
	bcax	$A[3][3],$A[0][3],$A[3][0],$A[3][4]	// A[0][3]=A[3][3]
	bcax	$A[3][4],$A[3][4],$A[3][1],$A[3][0]
	bcax	$A[3][0],$A[3][0],$D[1],   $A[3][1]
	bcax	$A[3][1],$A[3][1],$A[0][3],$D[1]	// A[0][3]=A[3][3]

	bcax	$A[2][0],$C[0],   $A[2][2],$D[2]
	bcax	$A[2][1],$D[2],   $A[2][3],$A[2][2]
	bcax	$A[2][2],$A[2][2],$A[2][4],$A[2][3]
	bcax	$A[2][3],$A[2][3],$C[0],   $A[2][4]
	bcax	$A[2][4],$A[2][4],$D[2],   $C[0]

	bcax	$A[1][2],$D[0],   $A[1][4],$A[0][4]	// A[0][4]=A[1][3]
	bcax	$A[1][3],$A[0][4],$A[1][0],$A[1][4]	// A[0][4]=A[1][3]
	bcax	$A[1][4],$A[1][4],$A[1][1],$A[1][0]
	bcax	$A[1][0],$A[1][0],$D[0],   $A[1][1]
	bcax	$A[1][1],$A[1][1],$A[0][4],$D[0]	// A[0][4]=A[1][3]

	bcax	$A[0][3],$D[3],   $A[0][0],$D[4]
	bcax	$A[0][4],$D[4],   $A[0][1],$A[0][0]
	bcax	$A[0][0],$A[0][0],$A[0][2],$A[0][1]
	bcax	$A[0][1],$A[0][1],$D[3],   $A[0][2]
	bcax	$A[0][2],$A[0][2],$D[4],   $D[3]

	eor	$A[0][0],$A[0][0],$C[1]

	subs	x9,x9,#1
	bne	.Loop_ce

	ret
.size	KeccakF1600_ce,.-KeccakF1600_ce

.type	KeccakF1600_cext,%function
.align	5
KeccakF1600_cext:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#16]		// per ABI requirement
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
___
# The load/store loops below use the package variable $i on purpose: it is
# left at 24 when a loop ends, and the heredoc that follows uses it for the
# offset of the 25th lane (d24).
for($i=0; $i<24; $i+=2) {		# load A[5][5]
my $j=$i+1;
$code.=<<___;
	ldp	d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
	ldr	d24,[x0,#8*$i]
	bl	KeccakF1600_ce
	ldr	x30,[sp,#8]
___
for($i=0; $i<24; $i+=2) {		# store A[5][5]
my $j=$i+1;
$code.=<<___;
	stp	d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
	str	d24,[x0,#8*$i]

	ldp	d8,d9,[sp,#16]
	ldp	d10,d11,[sp,#32]
	ldp	d12,d13,[sp,#48]
	ldp	d14,d15,[sp,#64]
	ldr	x29,[sp],#80
	.inst	0xd50323bf		// autiasp
	ret
.size	KeccakF1600_cext,.-KeccakF1600_cext
___

{
my ($ctx,$inp,$len,$bsz) = map("x$_",(0..3));

$code.=<<___;
.globl	SHA3_absorb_cext
.type	SHA3_absorb_cext,%function
.align	5
SHA3_absorb_cext:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#16]		// per ABI requirement
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
___
for($i=0; $i<24; $i+=2) {		# load A[5][5]
my $j=$i+1;
$code.=<<___;
	ldp	d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
	ldr	d24,[x0,#8*$i]
	b	.Loop_absorb_ce

.align	4
.Loop_absorb_ce:
	subs	$len,$len,$bsz		// len - bsz
	blo	.Labsorbed_ce
___
# Unrolled absorption of one block, two lanes ($i and $i+1) per iteration.
# After each even lane bsz is compared with the number of bytes consumed
# once the following odd lane is included: "blo" leaves before the odd
# lane, "beq" (same flags) leaves right after it.
for (my $i=0; $i<24; $i+=2) {
my $j = $i+1;
$code.=<<___;
	ldr	d31,[$inp],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	$A[$i/5][$i%5],$A[$i/5][$i%5],v31.16b
	cmp	$bsz,#8*($i+2)
	blo	.Lprocess_block_ce
	ldr	d31,[$inp],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	$A[$j/5][$j%5],$A[$j/5][$j%5],v31.16b
	beq	.Lprocess_block_ce
___
}
$code.=<<___;
	ldr	d31,[$inp],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	$A[4][4],$A[4][4],v31.16b

.Lprocess_block_ce:

	bl	KeccakF1600_ce

	b	.Loop_absorb_ce

.align	4
.Labsorbed_ce:
___
for($i=0; $i<24; $i+=2) {		# store A[5][5]
my $j=$i+1;
$code.=<<___;
	stp	d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
	str	d24,[x0,#8*$i]
	add	x0,$len,$bsz		// return value

	ldp	d8,d9,[sp,#16]
	ldp	d10,d11,[sp,#32]
	ldp	d12,d13,[sp,#48]
	ldp	d14,d15,[sp,#64]
	ldp	x29,x30,[sp],#80
	.inst	0xd50323bf		// autiasp
	ret
.size	SHA3_absorb_cext,.-SHA3_absorb_cext
___
}
{
my ($ctx,$out,$len,$bsz) = map("x$_",(0..3));
$code.=<<___;
.globl	SHA3_squeeze_cext
.type	SHA3_squeeze_cext,%function
.align	5
SHA3_squeeze_cext:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	mov	x9,$ctx
	mov	x10,$bsz

.Loop_squeeze_ce:
	ldr	x4,[x9],#8
	cmp	$len,#8
	blo	.Lsqueeze_tail_ce
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[$out],#8
	beq	.Lsqueeze_done_ce

	sub	$len,$len,#8
	subs	x10,x10,#8
	bhi	.Loop_squeeze_ce

	bl	KeccakF1600_cext
	ldr	x30,[sp,#8]
	mov	x9,$ctx
	mov	x10,$bsz
	b	.Loop_squeeze_ce

.align	4
.Lsqueeze_tail_ce:
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1

.Lsqueeze_done_ce:
	ldr	x29,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	SHA3_squeeze_cext,.-SHA3_squeeze_cext
___
}								}}}
# Identification string embedded in the generated object.
$code.=<<___;
.asciz	"Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
___
854
{   # Base opcodes of the four SHA3-extension instructions used in this file;
    # the operand fields are OR-ed into them by unsha3().
    my  %opcode = (
	"rax1"	=> 0xce608c00,	"eor3"	=> 0xce000000,
	"bcax"	=> 0xce200000,	"xar"	=> 0xce800000	);

    # unsha3($mnemonic, $arg) - encode one SHA3-extension instruction as a
    # raw ".inst" word so that assemblers without these mnemonics accept it.
    #
    #   $mnemonic  one of the keys of %opcode
    #   $arg       the operand list, e.g. "v25.16b,v1.16b,v30.16b,#64-1";
    #              two to four comma-separated operands, the last of which
    #              may be a register or a "#" immediate expression made of
    #              digits and "-"
    #
    # Returns the ".inst" line with the original instruction as a trailing
    # comment.  Dies if the operands cannot be parsed: the caller substitutes
    # the return value into the assembly line, so returning a false value
    # would silently delete the instruction from the output.
    sub unsha3 {
	my ($mnemonic,$arg)=@_;

	my ($rd,$rn,$rm,$last) =
	    $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
	    or die "unsha3: can't parse operands of '$mnemonic $arg'";

	# Operands that are absent contribute 0 to the encoding.
	$rm = 0 unless defined $rm;

	# The last operand can be an arithmetic expression such as "64-44";
	# the pattern above restricts it to digits and '-', so the string
	# eval only ever sees simple arithmetic.  An expression that does not
	# evaluate counts as 0.
	my $field = defined $last ? (eval($last) || 0) : 0;

	# The field is 6 bits wide at bit 10.  Masking keeps a rotate
	# expression that evaluates to 64 (e.g. "#64-0") from spilling into
	# the register field above it; it encodes as a rotation by 0 instead.
	$field &= 0x3f;

	return sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$rd|($rn<<5)|($rm<<16)|($field<<10),
			$mnemonic,$arg;
    }
}
869
# Post-process the accumulated assembly one line at a time and print it.
# STDOUT was pointed at the arm-xlate.pl pipe earlier in this script.
foreach(split("\n",$code)) {

	# Replace every `...` span by the result of evaluating its contents
	# as a Perl expression.
	s/\`([^\`]*)\`/eval($1)/ge;

	# The "and ... or" chain reads as follows: on an ld1r line, rewrite
	# the .16b arrangement to .2d.  On any other line (or if that rewrite
	# changed nothing), look for an SHA3-extension mnemonic with vector
	# operands and replace it by the .inst word produced by unsha3().
	m/\bld1r\b/ and s/\.16b/.2d/g	or
	s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;

	print $_,"\n";
}

# Output is buffered and goes to a pipe, so a failure may only show up
# when the handle is closed; treat that as fatal.
close STDOUT or die "error closing STDOUT: $!";
881