• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#! /usr/bin/env perl
2# Copyright 2008-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
12#
13# This module may be used under the terms of either the GNU General
14# Public License version 2 or later, the GNU Lesser General Public
15# License version 2.1 or later, the Mozilla Public License version
16# 1.1 or the BSD License. The exact terms of either license are
17# distributed along with this module. For further details see
18# http://www.openssl.org/~appro/camellia/.
19# ====================================================================
20
21# Performance in cycles per processed byte (less is better) in
22# 'openssl speed ...' benchmark:
23#
24#			AMD64	Core2	EM64T
25# -evp camellia-128-ecb	16.7	21.0	22.7
26# + over gcc 3.4.6	+25%	+5%	0%
27#
28# camellia-128-cbc	15.7	20.4	21.1
29#
30# 128-bit key setup	128	216	205	cycles/key
31# + over gcc 3.4.6	+54%	+39%	+15%
32#
33# Numbers in "+" rows represent performance improvement over compiler
34# generated code. Key setup timings are impressive on AMD and Core2
35# thanks to 64-bit operations being covertly deployed. Improvement on
36# EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
37# apparently emulates some of 64-bit operations in [32-bit] microcode.
38
# Command line: [flavour] [output].
# The trailing argument becomes $output when it ends in a file extension;
# the leading argument becomes $flavour when it contains no dot at all.
$output  = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop   : undef;
$flavour = (@ARGV && $ARGV[0]  !~ m|\.|)     ? shift : undef;

# Win64 conventions are selected by the flavour or by an .asm output name.
$win64 = ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/) ? 1 : 0;

# Look for the translator next to this script first, then in the shared
# perlasm directory two levels up.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;
foreach my $candidate ("${dir}x86_64-xlate.pl",
                       "${dir}../../perlasm/x86_64-xlate.pl") {
    $xlate = $candidate;
    last if -f $xlate;
}
-f $xlate or die "can't locate x86_64-xlate.pl";

# From here on everything printed to STDOUT is piped into the translator.
my $cmd = "| \"$^X\" \"$xlate\" $flavour \"$output\"";
open OUT,$cmd
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
54
# Map a legacy register name (%eax, %rbx, ...) to its high-byte alias
# (%ah, %bh, ...).  Operands that do not match are returned unchanged.
# The templates call this as &hi("...") from back-quoted expressions, so
# no prototype is declared: the body always consumes one argument.
sub hi {
    my $r = shift;
    $r =~ s/%[er]([a-d])x/%${1}h/;
    return $r;
}
# Map a register name to its low-byte alias.  Operands that match none of
# the three patterns are returned unchanged.  Called as &lo("...") from
# back-quoted expressions in the templates, hence no prototype.
sub lo {
    my $r = shift;
    $r =~ s/%[er]([a-d])x/%${1}l/;	# %eax, %rax  -> %al
    $r =~ s/%[er]([sd]i)/%${1}l/;	# %esi, %rdi  -> %sil, %dil
    $r =~ s/%(r[0-9]+)[d]?/%${1}b/;	# %r8,  %r8d  -> %r8b
    return $r;
}
59
# Register allocation shared by the code templates in this file.
($t0,$t1,$t2,$t3) = ("%eax","%ebx","%ecx","%edx");	# 32-bit temporaries
@S = map { "%r${_}d" } 8..11;				# four 32-bit state words
($i0,$i1) = ("%esi","%edi");				# table index registers
$Tbl = "%rbp";						# size optimization
($inp,$out,$key,$keyend) = ("%r12","%r13","%r14","%r15");
# 32-bit form of the first argument register: %ecx for Win64, else %edi.
$arg0d = $win64 ? "%ecx" : "%edi";

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. This is done to minimize code size.
# The values below are used as displacements off $Tbl in the lookups.
($SBOX1_1110,$SBOX4_4404) = (0,   4);		# Camellia_SBOX[0], [1]
($SBOX2_0222,$SBOX3_3033) = (2048,2052);	# Camellia_SBOX[2], [3]
78
# Append one Camellia Feistel round to $code.
#
#   $i    - round index; its parity selects which pair of @S is mixed
#           with the round key and which pair receives the result.
#   $seed - optional base displacement for the next-key prefetch
#           (default 0).  A negative seed makes the prefetch walk the
#           key schedule downwards (-8 bytes per round instead of +8).
#
# The round key is expected in $t1/$t0 on entry; the round finishes by
# prefetching the next key pair into the same registers.  Back-quoted
# expressions are left in the text and evaluated when $code is
# post-processed at the end of the script.
sub Camellia_Feistel {
my ($i,$seed)=@_;
$seed=0 unless defined($seed);
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
my ($s0,$s1,$s2,$s3)=@S[map { ($j+$_)%4 } 0..3];

$code.=<<___;
	xor	$s0,$t0				# t0^=key[0]
	xor	$s1,$t1				# t1^=key[1]
	movz	`&hi("$t0")`,$i0		# (t0>>8)&0xff
	movz	`&lo("$t1")`,$i1		# (t1>>0)&0xff
	mov	$SBOX3_3033($Tbl,$i0,8),$t3	# t3=SBOX3_3033[0]
	mov	$SBOX1_1110($Tbl,$i1,8),$t2	# t2=SBOX1_1110[1]
	movz	`&lo("$t0")`,$i0		# (t0>>0)&0xff
	shr	\$16,$t0
	movz	`&hi("$t1")`,$i1		# (t1>>8)&0xff
	xor	$SBOX4_4404($Tbl,$i0,8),$t3	# t3^=SBOX4_4404[0]
	shr	\$16,$t1
	xor	$SBOX4_4404($Tbl,$i1,8),$t2	# t2^=SBOX4_4404[1]
	movz	`&hi("$t0")`,$i0		# (t0>>24)&0xff
	movz	`&lo("$t1")`,$i1		# (t1>>16)&0xff
	xor	$SBOX1_1110($Tbl,$i0,8),$t3	# t3^=SBOX1_1110[0]
	xor	$SBOX3_3033($Tbl,$i1,8),$t2	# t2^=SBOX3_3033[1]
	movz	`&lo("$t0")`,$i0		# (t0>>16)&0xff
	movz	`&hi("$t1")`,$i1		# (t1>>24)&0xff
	xor	$SBOX2_0222($Tbl,$i0,8),$t3	# t3^=SBOX2_0222[0]
	xor	$SBOX2_0222($Tbl,$i1,8),$t2	# t2^=SBOX2_0222[1]
	mov	`$seed+($i+1)*$scale`($key),$t1	# prefetch key[i+1]
	mov	`$seed+($i+1)*$scale+4`($key),$t0
	xor	$t3,$t2				# t2^=t3
	ror	\$8,$t3				# t3=RightRotate(t3,8)
	xor	$t2,$s2
	xor	$t2,$s3
	xor	$t3,$s3
___
}
116
117# void Camellia_EncryptBlock_Rounds(
118#		int grandRounds,
119#		const Byte plaintext[],
120#		const KEY_TABLE_TYPE keyTable,
121#		Byte ciphertext[])
122$code=<<___;
123.text
124
125# V1.x API
126.globl	Camellia_EncryptBlock
127.type	Camellia_EncryptBlock,\@abi-omnipotent
128.align	16
129Camellia_EncryptBlock:
130.cfi_startproc
131	movl	\$128,%eax
132	subl	$arg0d,%eax
133	movl	\$3,$arg0d
134	adcl	\$0,$arg0d	# keyBitLength==128?3:4
135	jmp	.Lenc_rounds
136.cfi_endproc
137.size	Camellia_EncryptBlock,.-Camellia_EncryptBlock
138# V2
139.globl	Camellia_EncryptBlock_Rounds
140.type	Camellia_EncryptBlock_Rounds,\@function,4
141.align	16
142.Lenc_rounds:
143Camellia_EncryptBlock_Rounds:
144.cfi_startproc
145	push	%rbx
146.cfi_push	%rbx
147	push	%rbp
148.cfi_push	%rbp
149	push	%r13
150.cfi_push	%r13
151	push	%r14
152.cfi_push	%r14
153	push	%r15
154.cfi_push	%r15
155.Lenc_prologue:
156
157	#mov	%rsi,$inp		# put away arguments
158	mov	%rcx,$out
159	mov	%rdx,$key
160
161	shl	\$6,%edi		# process grandRounds
162	lea	.LCamellia_SBOX(%rip),$Tbl
163	lea	($key,%rdi),$keyend
164
165	mov	0(%rsi),@S[0]		# load plaintext
166	mov	4(%rsi),@S[1]
167	mov	8(%rsi),@S[2]
168	bswap	@S[0]
169	mov	12(%rsi),@S[3]
170	bswap	@S[1]
171	bswap	@S[2]
172	bswap	@S[3]
173
174	call	_x86_64_Camellia_encrypt
175
176	bswap	@S[0]
177	bswap	@S[1]
178	bswap	@S[2]
179	mov	@S[0],0($out)
180	bswap	@S[3]
181	mov	@S[1],4($out)
182	mov	@S[2],8($out)
183	mov	@S[3],12($out)
184
185	mov	0(%rsp),%r15
186.cfi_restore	%r15
187	mov	8(%rsp),%r14
188.cfi_restore	%r14
189	mov	16(%rsp),%r13
190.cfi_restore	%r13
191	mov	24(%rsp),%rbp
192.cfi_restore	%rbp
193	mov	32(%rsp),%rbx
194.cfi_restore	%rbx
195	lea	40(%rsp),%rsp
196.cfi_adjust_cfa_offset	-40
197.Lenc_epilogue:
198	ret
199.cfi_endproc
200.size	Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds
201
202.type	_x86_64_Camellia_encrypt,\@abi-omnipotent
203.align	16
204_x86_64_Camellia_encrypt:
205.cfi_startproc
206	xor	0($key),@S[1]
207	xor	4($key),@S[0]		# ^=key[0-3]
208	xor	8($key),@S[3]
209	xor	12($key),@S[2]
210.align	16
211.Leloop:
212	mov	16($key),$t1		# prefetch key[4-5]
213	mov	20($key),$t0
214
215___
216	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
217$code.=<<___;
218	lea	16*4($key),$key
219	cmp	$keyend,$key
220	mov	8($key),$t3		# prefetch key[2-3]
221	mov	12($key),$t2
222	je	.Ledone
223
224	and	@S[0],$t0
225	or	@S[3],$t3
226	rol	\$1,$t0
227	xor	$t3,@S[2]		# s2^=s3|key[3];
228	xor	$t0,@S[1]		# s1^=LeftRotate(s0&key[0],1);
229	and	@S[2],$t2
230	or	@S[1],$t1
231	rol	\$1,$t2
232	xor	$t1,@S[0]		# s0^=s1|key[1];
233	xor	$t2,@S[3]		# s3^=LeftRotate(s2&key[2],1);
234	jmp	.Leloop
235
236.align	16
237.Ledone:
238	xor	@S[2],$t0		# SwapHalf
239	xor	@S[3],$t1
240	xor	@S[0],$t2
241	xor	@S[1],$t3
242
243	mov	$t0,@S[0]
244	mov	$t1,@S[1]
245	mov	$t2,@S[2]
246	mov	$t3,@S[3]
247
248	.byte	0xf3,0xc3		# rep ret
249.cfi_endproc
250.size	_x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt
251
252# V1.x API
253.globl	Camellia_DecryptBlock
254.type	Camellia_DecryptBlock,\@abi-omnipotent
255.align	16
256Camellia_DecryptBlock:
257.cfi_startproc
258	movl	\$128,%eax
259	subl	$arg0d,%eax
260	movl	\$3,$arg0d
261	adcl	\$0,$arg0d	# keyBitLength==128?3:4
262	jmp	.Ldec_rounds
263.cfi_endproc
264.size	Camellia_DecryptBlock,.-Camellia_DecryptBlock
265# V2
266.globl	Camellia_DecryptBlock_Rounds
267.type	Camellia_DecryptBlock_Rounds,\@function,4
268.align	16
269.Ldec_rounds:
270Camellia_DecryptBlock_Rounds:
271.cfi_startproc
272	push	%rbx
273.cfi_push	%rbx
274	push	%rbp
275.cfi_push	%rbp
276	push	%r13
277.cfi_push	%r13
278	push	%r14
279.cfi_push	%r14
280	push	%r15
281.cfi_push	%r15
282.Ldec_prologue:
283
284	#mov	%rsi,$inp		# put away arguments
285	mov	%rcx,$out
286	mov	%rdx,$keyend
287
288	shl	\$6,%edi		# process grandRounds
289	lea	.LCamellia_SBOX(%rip),$Tbl
290	lea	($keyend,%rdi),$key
291
292	mov	0(%rsi),@S[0]		# load plaintext
293	mov	4(%rsi),@S[1]
294	mov	8(%rsi),@S[2]
295	bswap	@S[0]
296	mov	12(%rsi),@S[3]
297	bswap	@S[1]
298	bswap	@S[2]
299	bswap	@S[3]
300
301	call	_x86_64_Camellia_decrypt
302
303	bswap	@S[0]
304	bswap	@S[1]
305	bswap	@S[2]
306	mov	@S[0],0($out)
307	bswap	@S[3]
308	mov	@S[1],4($out)
309	mov	@S[2],8($out)
310	mov	@S[3],12($out)
311
312	mov	0(%rsp),%r15
313.cfi_restore	%r15
314	mov	8(%rsp),%r14
315.cfi_restore	%r14
316	mov	16(%rsp),%r13
317.cfi_restore	%r13
318	mov	24(%rsp),%rbp
319.cfi_restore	%rbp
320	mov	32(%rsp),%rbx
321.cfi_restore	%rbx
322	lea	40(%rsp),%rsp
323.cfi_adjust_cfa_offset	-40
324.Ldec_epilogue:
325	ret
326.cfi_endproc
327.size	Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds
328
329.type	_x86_64_Camellia_decrypt,\@abi-omnipotent
330.align	16
331_x86_64_Camellia_decrypt:
332.cfi_startproc
333	xor	0($key),@S[1]
334	xor	4($key),@S[0]		# ^=key[0-3]
335	xor	8($key),@S[3]
336	xor	12($key),@S[2]
337.align	16
338.Ldloop:
339	mov	-8($key),$t1		# prefetch key[4-5]
340	mov	-4($key),$t0
341
342___
343	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
344$code.=<<___;
345	lea	-16*4($key),$key
346	cmp	$keyend,$key
347	mov	0($key),$t3		# prefetch key[2-3]
348	mov	4($key),$t2
349	je	.Lddone
350
351	and	@S[0],$t0
352	or	@S[3],$t3
353	rol	\$1,$t0
354	xor	$t3,@S[2]		# s2^=s3|key[3];
355	xor	$t0,@S[1]		# s1^=LeftRotate(s0&key[0],1);
356	and	@S[2],$t2
357	or	@S[1],$t1
358	rol	\$1,$t2
359	xor	$t1,@S[0]		# s0^=s1|key[1];
360	xor	$t2,@S[3]		# s3^=LeftRotate(s2&key[2],1);
361
362	jmp	.Ldloop
363
364.align	16
365.Lddone:
366	xor	@S[2],$t2
367	xor	@S[3],$t3
368	xor	@S[0],$t0
369	xor	@S[1],$t1
370
371	mov	$t2,@S[0]		# SwapHalf
372	mov	$t3,@S[1]
373	mov	$t0,@S[2]
374	mov	$t1,@S[3]
375
376	.byte	0xf3,0xc3		# rep ret
377.cfi_endproc
378.size	_x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
379___
380
# Append code that stores key material into the key table.
#
#   $rnd - slot index; stores go to displacement $bias+$rnd*8 off $key
#   $key - register holding the key-table pointer
#   @T   - optional integer bias followed by the source registers:
#            four registers   -> four 4-byte stores, with each pair
#                                written in swapped order (T[1],T[0],
#                                T[3],T[2])
#            one or two regs  -> stores at +0 and, if present, +8
#
# The bias is recognised by looking like an integer literal, so an
# explicit bias of 0 is handled the same way as any other bias.
sub _saveround {
my ($rnd,$key,@T)=@_;
my $bias=(defined($T[0]) && $T[0] =~ /^-?\d+$/)?shift(@T):0;

    if ($#T==3) {
	$code.=<<___;
	mov	$T[1],`$bias+$rnd*8+0`($key)
	mov	$T[0],`$bias+$rnd*8+4`($key)
	mov	$T[3],`$bias+$rnd*8+8`($key)
	mov	$T[2],`$bias+$rnd*8+12`($key)
___
    } else {
	$code.="	mov	$T[0],`$bias+$rnd*8+0`($key)\n";
	$code.="	mov	$T[1],`$bias+$rnd*8+8`($key)\n"	if ($#T>=1);
    }
}
397
# Append code that loads key material from the key table.
#
#   $rnd - slot index; loads come from displacement $bias+$rnd*8 off $key
#   $key - register holding the key-table pointer
#   @T   - optional integer bias followed by one or two destination
#          registers, filled from +0 and (if present) +8
#
# The bias is recognised by looking like an integer literal, so an
# explicit bias of 0 is handled the same way as any other bias.
sub _loadround {
my ($rnd,$key,@T)=@_;
my $bias=(defined($T[0]) && $T[0] =~ /^-?\d+$/)?shift(@T):0;

$code.="	mov	`$bias+$rnd*8+0`($key),$T[0]\n";
$code.="	mov	`$bias+$rnd*8+8`($key),$T[1]\n"	if ($#T>=1);
}
405
406# shld is very slow on Intel EM64T family. Even on AMD it limits
407# instruction decode rate [because it's VectorPath] and consequently
408# performance...
# shld-based rotate of the 128-bit value held in the register pair
# ($i0 upper half, $i1 lower half) left by $rot bits, with %r11 as
# scratch.  Nothing is emitted when $rot is zero.
sub __rotl128 {
    my ($upper,$lower,$rot)=@_;

    $code.=join("",
	"\tmov\t$upper,%r11\n",
	"\tshld\t\$$rot,$lower,$upper\n",
	"\tshld\t\$$rot,%r11,$lower\n") if ($rot);
}
420
# ... Implementing 128-bit rotate without shld gives 80% better
# performance on EM64T, +15% on AMD64 and only ~7% degradation on
# Core2. This is therefore preferred.
# Rotate the 128-bit value held in the register pair ($i0 upper half,
# $i1 lower half) left by $rot bits using plain shifts and ors.  %r9 and
# %r11 are used as scratch.  The complementary shift count 64-$rot is
# left as a back-quoted expression for the final post-processing pass.
# Nothing is emitted when $rot is zero.
sub _rotl128 {
    my ($upper,$lower,$rot)=@_;
    my $back="\$`64-$rot`";

    $code.=join("\n",
	"\tmov\t$upper,%r11",
	"\tshl\t\$$rot,$upper",
	"\tmov\t$lower,%r9",
	"\tshr\t$back,%r9",
	"\tshr\t$back,%r11",
	"\tor\t%r9,$upper",
	"\tshl\t\$$rot,$lower",
	"\tor\t%r11,$lower",
	"") if ($rot);
}
440
441{ my $step=0;
442
443$code.=<<___;
444.globl	Camellia_Ekeygen
445.type	Camellia_Ekeygen,\@function,3
446.align	16
447Camellia_Ekeygen:
448.cfi_startproc
449	push	%rbx
450.cfi_push	%rbx
451	push	%rbp
452.cfi_push	%rbp
453	push	%r13
454.cfi_push	%r13
455	push	%r14
456.cfi_push	%r14
457	push	%r15
458.cfi_push	%r15
459.Lkey_prologue:
460
461	mov	%edi,${keyend}d		# put away arguments, keyBitLength
462	mov	%rdx,$out		# keyTable
463
464	mov	0(%rsi),@S[0]		# load 0-127 bits
465	mov	4(%rsi),@S[1]
466	mov	8(%rsi),@S[2]
467	mov	12(%rsi),@S[3]
468
469	bswap	@S[0]
470	bswap	@S[1]
471	bswap	@S[2]
472	bswap	@S[3]
473___
474	&_saveround	(0,$out,@S);	# KL<<<0
475$code.=<<___;
476	cmp	\$128,$keyend		# check keyBitLength
477	je	.L1st128
478
479	mov	16(%rsi),@S[0]		# load 128-191 bits
480	mov	20(%rsi),@S[1]
481	cmp	\$192,$keyend
482	je	.L1st192
483	mov	24(%rsi),@S[2]		# load 192-255 bits
484	mov	28(%rsi),@S[3]
485	jmp	.L1st256
486.L1st192:
487	mov	@S[0],@S[2]
488	mov	@S[1],@S[3]
489	not	@S[2]
490	not	@S[3]
491.L1st256:
492	bswap	@S[0]
493	bswap	@S[1]
494	bswap	@S[2]
495	bswap	@S[3]
496___
497	&_saveround	(4,$out,@S);	# temp storage for KR!
498$code.=<<___;
499	xor	0($out),@S[1]		# KR^KL
500	xor	4($out),@S[0]
501	xor	8($out),@S[3]
502	xor	12($out),@S[2]
503
504.L1st128:
505	lea	.LCamellia_SIGMA(%rip),$key
506	lea	.LCamellia_SBOX(%rip),$Tbl
507
508	mov	0($key),$t1
509	mov	4($key),$t0
510___
511	&Camellia_Feistel($step++);
512	&Camellia_Feistel($step++);
513$code.=<<___;
514	xor	0($out),@S[1]		# ^KL
515	xor	4($out),@S[0]
516	xor	8($out),@S[3]
517	xor	12($out),@S[2]
518___
519	&Camellia_Feistel($step++);
520	&Camellia_Feistel($step++);
521$code.=<<___;
522	cmp	\$128,$keyend
523	jne	.L2nd256
524
525	lea	128($out),$out		# size optimization
526	shl	\$32,%r8		# @S[0]||
527	shl	\$32,%r10		# @S[2]||
528	or	%r9,%r8			# ||@S[1]
529	or	%r11,%r10		# ||@S[3]
530___
531	&_loadround	(0,$out,-128,"%rax","%rbx");	# KL
532	&_saveround	(2,$out,-128,"%r8","%r10");	# KA<<<0
533	&_rotl128	("%rax","%rbx",15);
534	&_saveround	(4,$out,-128,"%rax","%rbx");	# KL<<<15
535	&_rotl128	("%r8","%r10",15);
536	&_saveround	(6,$out,-128,"%r8","%r10");	# KA<<<15
537	&_rotl128	("%r8","%r10",15);		# 15+15=30
538	&_saveround	(8,$out,-128,"%r8","%r10");	# KA<<<30
539	&_rotl128	("%rax","%rbx",30);		# 15+30=45
540	&_saveround	(10,$out,-128,"%rax","%rbx");	# KL<<<45
541	&_rotl128	("%r8","%r10",15);		# 30+15=45
542	&_saveround	(12,$out,-128,"%r8");		# KA<<<45
543	&_rotl128	("%rax","%rbx",15);		# 45+15=60
544	&_saveround	(13,$out,-128,"%rbx");		# KL<<<60
545	&_rotl128	("%r8","%r10",15);		# 45+15=60
546	&_saveround	(14,$out,-128,"%r8","%r10");	# KA<<<60
547	&_rotl128	("%rax","%rbx",17);		# 60+17=77
548	&_saveround	(16,$out,-128,"%rax","%rbx");	# KL<<<77
549	&_rotl128	("%rax","%rbx",17);		# 77+17=94
550	&_saveround	(18,$out,-128,"%rax","%rbx");	# KL<<<94
551	&_rotl128	("%r8","%r10",34);		# 60+34=94
552	&_saveround	(20,$out,-128,"%r8","%r10");	# KA<<<94
553	&_rotl128	("%rax","%rbx",17);		# 94+17=111
554	&_saveround	(22,$out,-128,"%rax","%rbx");	# KL<<<111
555	&_rotl128	("%r8","%r10",17);		# 94+17=111
556	&_saveround	(24,$out,-128,"%r8","%r10");	# KA<<<111
557$code.=<<___;
558	mov	\$3,%eax
559	jmp	.Ldone
560.align	16
561.L2nd256:
562___
563	&_saveround	(6,$out,@S);	# temp storage for KA!
564$code.=<<___;
565	xor	`4*8+0`($out),@S[1]	# KA^KR
566	xor	`4*8+4`($out),@S[0]
567	xor	`5*8+0`($out),@S[3]
568	xor	`5*8+4`($out),@S[2]
569___
570	&Camellia_Feistel($step++);
571	&Camellia_Feistel($step++);
572
573	&_loadround	(0,$out,"%rax","%rbx");	# KL
574	&_loadround	(4,$out,"%rcx","%rdx");	# KR
575	&_loadround	(6,$out,"%r14","%r15");	# KA
576$code.=<<___;
577	lea	128($out),$out		# size optimization
578	shl	\$32,%r8		# @S[0]||
579	shl	\$32,%r10		# @S[2]||
580	or	%r9,%r8			# ||@S[1]
581	or	%r11,%r10		# ||@S[3]
582___
583	&_saveround	(2,$out,-128,"%r8","%r10");	# KB<<<0
584	&_rotl128	("%rcx","%rdx",15);
585	&_saveround	(4,$out,-128,"%rcx","%rdx");	# KR<<<15
586	&_rotl128	("%r14","%r15",15);
587	&_saveround	(6,$out,-128,"%r14","%r15");	# KA<<<15
588	&_rotl128	("%rcx","%rdx",15);		# 15+15=30
589	&_saveround	(8,$out,-128,"%rcx","%rdx");	# KR<<<30
590	&_rotl128	("%r8","%r10",30);
591	&_saveround	(10,$out,-128,"%r8","%r10");	# KB<<<30
592	&_rotl128	("%rax","%rbx",45);
593	&_saveround	(12,$out,-128,"%rax","%rbx");	# KL<<<45
594	&_rotl128	("%r14","%r15",30);		# 15+30=45
595	&_saveround	(14,$out,-128,"%r14","%r15");	# KA<<<45
596	&_rotl128	("%rax","%rbx",15);		# 45+15=60
597	&_saveround	(16,$out,-128,"%rax","%rbx");	# KL<<<60
598	&_rotl128	("%rcx","%rdx",30);		# 30+30=60
599	&_saveround	(18,$out,-128,"%rcx","%rdx");	# KR<<<60
600	&_rotl128	("%r8","%r10",30);		# 30+30=60
601	&_saveround	(20,$out,-128,"%r8","%r10");	# KB<<<60
602	&_rotl128	("%rax","%rbx",17);		# 60+17=77
603	&_saveround	(22,$out,-128,"%rax","%rbx");	# KL<<<77
604	&_rotl128	("%r14","%r15",32);		# 45+32=77
605	&_saveround	(24,$out,-128,"%r14","%r15");	# KA<<<77
606	&_rotl128	("%rcx","%rdx",34);		# 60+34=94
607	&_saveround	(26,$out,-128,"%rcx","%rdx");	# KR<<<94
608	&_rotl128	("%r14","%r15",17);		# 77+17=94
609	&_saveround	(28,$out,-128,"%r14","%r15");	# KA<<<77
610	&_rotl128	("%rax","%rbx",34);		# 77+34=111
611	&_saveround	(30,$out,-128,"%rax","%rbx");	# KL<<<111
612	&_rotl128	("%r8","%r10",51);		# 60+51=111
613	&_saveround	(32,$out,-128,"%r8","%r10");	# KB<<<111
614$code.=<<___;
615	mov	\$4,%eax
616.Ldone:
617	mov	0(%rsp),%r15
618.cfi_restore	%r15
619	mov	8(%rsp),%r14
620.cfi_restore	%r14
621	mov	16(%rsp),%r13
622.cfi_restore	%r13
623	mov	24(%rsp),%rbp
624.cfi_restore	%rbp
625	mov	32(%rsp),%rbx
626.cfi_restore	%rbx
627	lea	40(%rsp),%rsp
628.cfi_adjust_cfa_offset	-40
629.Lkey_epilogue:
630	ret
631.cfi_endproc
632.size	Camellia_Ekeygen,.-Camellia_Ekeygen
633___
634}
635
# 256-entry byte substitution table.  The S1110/S4404/S0222/S3033
# generators derive the four 32-bit lookup tables from it.  The trailing
# comment on each row is the index of the row's first entry.
@SBOX=(
    112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,	# 0x00
     35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,	# 0x10
    134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,	# 0x20
    166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,	# 0x30
    139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,	# 0x40
    223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,	# 0x50
     20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,	# 0x60
    254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,	# 0x70
    170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,	# 0x80
     16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,	# 0x90
    135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,	# 0xa0
     82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,	# 0xb0
    233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,	# 0xc0
    120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,	# 0xd0
    114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,	# 0xe0
     64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158,	# 0xf0
);
653
# Word for the table at offset $SBOX1_1110: the S-box byte for index $i
# placed in the three upper byte lanes, returned as a "0x%08x" string.
sub S1110 {
    my $i=shift;
    my $s=$SBOX[$i];
    return sprintf("0x%08x",$s<<24|$s<<16|$s<<8);
}
# Word for the table at offset $SBOX4_4404: the index is rotated left by
# one bit (within 8 bits) before the S-box lookup, and the resulting byte
# is placed in lanes 3, 2 and 0.  Returned as a "0x%08x" string.
sub S4404 {
    my $i=shift;
    my $s=$SBOX[($i<<1|$i>>7)&0xff];
    return sprintf("0x%08x",$s<<24|$s<<16|$s);
}
# Word for the table at offset $SBOX2_0222: the S-box byte for index $i
# is rotated left by one bit (within 8 bits) and placed in the three
# lower byte lanes.  Returned as a "0x%08x" string.
sub S0222 {
    my $i=shift;
    my $s=$SBOX[$i];
    $s=($s<<1|$s>>7)&0xff;
    return sprintf("0x%08x",$s<<16|$s<<8|$s);
}
# Word for the table at offset $SBOX3_3033: the S-box byte for index $i
# is rotated right by one bit (within 8 bits) and placed in lanes 3, 1
# and 0.  Returned as a "0x%08x" string.
sub S3033 {
    my $i=shift;
    my $s=$SBOX[$i];
    $s=($s>>1|$s<<7)&0xff;
    return sprintf("0x%08x",$s<<24|$s<<8|$s);
}
658
# Emit the 64-byte-aligned constant area: twelve non-zero words under
# .LCamellia_SIGMA, four zero words, then the .LCamellia_SBOX label that
# the table rows emitted after this statement attach to.
$code.=join("\n",
	".align\t64",
	".LCamellia_SIGMA:",
	".long\t0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858",
	".long\t0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5",
	".long\t0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2",
	".long\t0,          0,          0,          0",
	".LCamellia_SBOX:",
	"");
668# tables are interleaved, remember?
# Append one ".long" directive to $code with the arguments as its
# comma-separated operands.
sub data_word {
    my @words=@_;
    $code.=sprintf(".long\t%s\n",join(',',@words));
}
# Emit the two interleaved 256-row tables: first S1110 paired with S4404,
# then S0222 paired with S3033, one ".long a,b" row per index.
foreach my $pair ([\&S1110,\&S4404],[\&S0222,\&S3033]) {
    my ($first,$second)=@$pair;
    for ($i=0;$i<256;$i++) { &data_word($first->($i),$second->($i)); }
}
672
# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
#			size_t length, const CAMELLIA_KEY *key,
#			unsigned char *ivp,const int enc);
676{
677$_key="0(%rsp)";
678$_end="8(%rsp)";	# inp+len&~15
679$_res="16(%rsp)";	# len&15
680$ivec="24(%rsp)";
681$_ivp="40(%rsp)";
682$_rsp="48(%rsp)";
683
684$code.=<<___;
685.globl	Camellia_cbc_encrypt
686.type	Camellia_cbc_encrypt,\@function,6
687.align	16
688Camellia_cbc_encrypt:
689.cfi_startproc
690	endbranch
691	cmp	\$0,%rdx
692	je	.Lcbc_abort
693	push	%rbx
694.cfi_push	%rbx
695	push	%rbp
696.cfi_push	%rbp
697	push	%r12
698.cfi_push	%r12
699	push	%r13
700.cfi_push	%r13
701	push	%r14
702.cfi_push	%r14
703	push	%r15
704.cfi_push	%r15
705.Lcbc_prologue:
706
707	mov	%rsp,%rbp
708.cfi_def_cfa_register	%rbp
709	sub	\$64,%rsp
710	and	\$-64,%rsp
711
712	# place stack frame just "above mod 1024" the key schedule,
713	# this ensures that cache associativity suffices
714	lea	-64-63(%rcx),%r10
715	sub	%rsp,%r10
716	neg	%r10
717	and	\$0x3C0,%r10
718	sub	%r10,%rsp
719	#add	\$8,%rsp		# 8 is reserved for callee's ra
720
721	mov	%rdi,$inp		# inp argument
722	mov	%rsi,$out		# out argument
723	mov	%r8,%rbx		# ivp argument
724	mov	%rcx,$key		# key argument
725	mov	272(%rcx),${keyend}d	# grandRounds
726
727	mov	%r8,$_ivp
728	mov	%rbp,$_rsp
729.cfi_cfa_expression	$_rsp,deref,+56
730
731.Lcbc_body:
732	lea	.LCamellia_SBOX(%rip),$Tbl
733
734	mov	\$32,%ecx
735.align	4
736.Lcbc_prefetch_sbox:
737	mov	0($Tbl),%rax
738	mov	32($Tbl),%rsi
739	mov	64($Tbl),%rdi
740	mov	96($Tbl),%r11
741	lea	128($Tbl),$Tbl
742	loop	.Lcbc_prefetch_sbox
743	sub	\$4096,$Tbl
744	shl	\$6,$keyend
745	mov	%rdx,%rcx		# len argument
746	lea	($key,$keyend),$keyend
747
748	cmp	\$0,%r9d		# enc argument
749	je	.LCBC_DECRYPT
750
751	and	\$-16,%rdx
752	and	\$15,%rcx		# length residue
753	lea	($inp,%rdx),%rdx
754	mov	$key,$_key
755	mov	%rdx,$_end
756	mov	%rcx,$_res
757
758	cmp	$inp,%rdx
759	mov	0(%rbx),@S[0]		# load IV
760	mov	4(%rbx),@S[1]
761	mov	8(%rbx),@S[2]
762	mov	12(%rbx),@S[3]
763	je	.Lcbc_enc_tail
764	jmp	.Lcbc_eloop
765
766.align	16
767.Lcbc_eloop:
768	xor	0($inp),@S[0]
769	xor	4($inp),@S[1]
770	xor	8($inp),@S[2]
771	bswap	@S[0]
772	xor	12($inp),@S[3]
773	bswap	@S[1]
774	bswap	@S[2]
775	bswap	@S[3]
776
777	call	_x86_64_Camellia_encrypt
778
779	mov	$_key,$key		# "rewind" the key
780	bswap	@S[0]
781	mov	$_end,%rdx
782	bswap	@S[1]
783	mov	$_res,%rcx
784	bswap	@S[2]
785	mov	@S[0],0($out)
786	bswap	@S[3]
787	mov	@S[1],4($out)
788	mov	@S[2],8($out)
789	lea	16($inp),$inp
790	mov	@S[3],12($out)
791	cmp	%rdx,$inp
792	lea	16($out),$out
793	jne	.Lcbc_eloop
794
795	cmp	\$0,%rcx
796	jne	.Lcbc_enc_tail
797
798	mov	$_ivp,$out
799	mov	@S[0],0($out)		# write out IV residue
800	mov	@S[1],4($out)
801	mov	@S[2],8($out)
802	mov	@S[3],12($out)
803	jmp	.Lcbc_done
804
805.align	16
806.Lcbc_enc_tail:
807	xor	%rax,%rax
808	mov	%rax,0+$ivec
809	mov	%rax,8+$ivec
810	mov	%rax,$_res
811
812.Lcbc_enc_pushf:
813	pushfq
814	cld
815	mov	$inp,%rsi
816	lea	8+$ivec,%rdi
817	.long	0x9066A4F3		# rep movsb
818	popfq
819.Lcbc_enc_popf:
820
821	lea	$ivec,$inp
822	lea	16+$ivec,%rax
823	mov	%rax,$_end
824	jmp	.Lcbc_eloop		# one more time
825
826.align	16
827.LCBC_DECRYPT:
828	xchg	$key,$keyend
829	add	\$15,%rdx
830	and	\$15,%rcx		# length residue
831	and	\$-16,%rdx
832	mov	$key,$_key
833	lea	($inp,%rdx),%rdx
834	mov	%rdx,$_end
835	mov	%rcx,$_res
836
837	mov	(%rbx),%rax		# load IV
838	mov	8(%rbx),%rbx
839	jmp	.Lcbc_dloop
840.align	16
841.Lcbc_dloop:
842	mov	0($inp),@S[0]
843	mov	4($inp),@S[1]
844	mov	8($inp),@S[2]
845	bswap	@S[0]
846	mov	12($inp),@S[3]
847	bswap	@S[1]
848	mov	%rax,0+$ivec		# save IV to temporary storage
849	bswap	@S[2]
850	mov	%rbx,8+$ivec
851	bswap	@S[3]
852
853	call	_x86_64_Camellia_decrypt
854
855	mov	$_key,$key		# "rewind" the key
856	mov	$_end,%rdx
857	mov	$_res,%rcx
858
859	bswap	@S[0]
860	mov	($inp),%rax		# load IV for next iteration
861	bswap	@S[1]
862	mov	8($inp),%rbx
863	bswap	@S[2]
864	xor	0+$ivec,@S[0]
865	bswap	@S[3]
866	xor	4+$ivec,@S[1]
867	xor	8+$ivec,@S[2]
868	lea	16($inp),$inp
869	xor	12+$ivec,@S[3]
870	cmp	%rdx,$inp
871	je	.Lcbc_ddone
872
873	mov	@S[0],0($out)
874	mov	@S[1],4($out)
875	mov	@S[2],8($out)
876	mov	@S[3],12($out)
877
878	lea	16($out),$out
879	jmp	.Lcbc_dloop
880
881.align	16
882.Lcbc_ddone:
883	mov	$_ivp,%rdx
884	cmp	\$0,%rcx
885	jne	.Lcbc_dec_tail
886
887	mov	@S[0],0($out)
888	mov	@S[1],4($out)
889	mov	@S[2],8($out)
890	mov	@S[3],12($out)
891
892	mov	%rax,(%rdx)		# write out IV residue
893	mov	%rbx,8(%rdx)
894	jmp	.Lcbc_done
895.align	16
896.Lcbc_dec_tail:
897	mov	@S[0],0+$ivec
898	mov	@S[1],4+$ivec
899	mov	@S[2],8+$ivec
900	mov	@S[3],12+$ivec
901
902.Lcbc_dec_pushf:
903	pushfq
904	cld
905	lea	8+$ivec,%rsi
906	lea	($out),%rdi
907	.long	0x9066A4F3		# rep movsb
908	popfq
909.Lcbc_dec_popf:
910
911	mov	%rax,(%rdx)		# write out IV residue
912	mov	%rbx,8(%rdx)
913	jmp	.Lcbc_done
914
915.align	16
916.Lcbc_done:
917	mov	$_rsp,%rcx
918.cfi_def_cfa	%rcx,56
919	mov	0(%rcx),%r15
920.cfi_restore	%r15
921	mov	8(%rcx),%r14
922.cfi_restore	%r14
923	mov	16(%rcx),%r13
924.cfi_restore	%r13
925	mov	24(%rcx),%r12
926.cfi_restore	%r12
927	mov	32(%rcx),%rbp
928.cfi_restore	%rbp
929	mov	40(%rcx),%rbx
930.cfi_restore	%rbx
931	lea	48(%rcx),%rsp
932.cfi_def_cfa	%rsp,8
933.Lcbc_abort:
934	ret
935.cfi_endproc
936.size	Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
937
938.asciz	"Camellia for x86_64 by <appro\@openssl.org>"
939___
940}
941
942# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
943#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
944if ($win64) {
945$rec="%rcx";
946$frame="%rdx";
947$context="%r8";
948$disp="%r9";
949
950$code.=<<___;
951.extern	__imp_RtlVirtualUnwind
952.type	common_se_handler,\@abi-omnipotent
953.align	16
954common_se_handler:
955	push	%rsi
956	push	%rdi
957	push	%rbx
958	push	%rbp
959	push	%r12
960	push	%r13
961	push	%r14
962	push	%r15
963	pushfq
964	lea	-64(%rsp),%rsp
965
966	mov	120($context),%rax	# pull context->Rax
967	mov	248($context),%rbx	# pull context->Rip
968
969	mov	8($disp),%rsi		# disp->ImageBase
970	mov	56($disp),%r11		# disp->HandlerData
971
972	mov	0(%r11),%r10d		# HandlerData[0]
973	lea	(%rsi,%r10),%r10	# prologue label
974	cmp	%r10,%rbx		# context->Rip<prologue label
975	jb	.Lin_prologue
976
977	mov	152($context),%rax	# pull context->Rsp
978
979	mov	4(%r11),%r10d		# HandlerData[1]
980	lea	(%rsi,%r10),%r10	# epilogue label
981	cmp	%r10,%rbx		# context->Rip>=epilogue label
982	jae	.Lin_prologue
983
984	lea	40(%rax),%rax
985	mov	-8(%rax),%rbx
986	mov	-16(%rax),%rbp
987	mov	-24(%rax),%r13
988	mov	-32(%rax),%r14
989	mov	-40(%rax),%r15
990	mov	%rbx,144($context)	# restore context->Rbx
991	mov	%rbp,160($context)	# restore context->Rbp
992	mov	%r13,224($context)	# restore context->R13
993	mov	%r14,232($context)	# restore context->R14
994	mov	%r15,240($context)	# restore context->R15
995
996.Lin_prologue:
997	mov	8(%rax),%rdi
998	mov	16(%rax),%rsi
999	mov	%rax,152($context)	# restore context->Rsp
1000	mov	%rsi,168($context)	# restore context->Rsi
1001	mov	%rdi,176($context)	# restore context->Rdi
1002
1003	jmp	.Lcommon_seh_exit
1004.size	common_se_handler,.-common_se_handler
1005
1006.type	cbc_se_handler,\@abi-omnipotent
1007.align	16
1008cbc_se_handler:
1009	push	%rsi
1010	push	%rdi
1011	push	%rbx
1012	push	%rbp
1013	push	%r12
1014	push	%r13
1015	push	%r14
1016	push	%r15
1017	pushfq
1018	lea	-64(%rsp),%rsp
1019
1020	mov	120($context),%rax	# pull context->Rax
1021	mov	248($context),%rbx	# pull context->Rip
1022
1023	lea	.Lcbc_prologue(%rip),%r10
1024	cmp	%r10,%rbx		# context->Rip<.Lcbc_prologue
1025	jb	.Lin_cbc_prologue
1026
1027	lea	.Lcbc_body(%rip),%r10
1028	cmp	%r10,%rbx		# context->Rip<.Lcbc_body
1029	jb	.Lin_cbc_frame_setup
1030
1031	mov	152($context),%rax	# pull context->Rsp
1032
1033	lea	.Lcbc_abort(%rip),%r10
1034	cmp	%r10,%rbx		# context->Rip>=.Lcbc_abort
1035	jae	.Lin_cbc_prologue
1036
1037	# handle pushf/popf in Camellia_cbc_encrypt
1038	lea	.Lcbc_enc_pushf(%rip),%r10
1039	cmp	%r10,%rbx		# context->Rip<=.Lcbc_enc_pushf
1040	jbe	.Lin_cbc_no_flag
1041	lea	8(%rax),%rax
1042	lea	.Lcbc_enc_popf(%rip),%r10
1043	cmp	%r10,%rbx		# context->Rip<.Lcbc_enc_popf
1044	jb	.Lin_cbc_no_flag
1045	lea	-8(%rax),%rax
1046	lea	.Lcbc_dec_pushf(%rip),%r10
1047	cmp	%r10,%rbx		# context->Rip<=.Lcbc_dec_pushf
1048	jbe	.Lin_cbc_no_flag
1049	lea	8(%rax),%rax
1050	lea	.Lcbc_dec_popf(%rip),%r10
1051	cmp	%r10,%rbx		# context->Rip<.Lcbc_dec_popf
1052	jb	.Lin_cbc_no_flag
1053	lea	-8(%rax),%rax
1054
1055.Lin_cbc_no_flag:
1056	mov	48(%rax),%rax		# $_rsp
1057	lea	48(%rax),%rax
1058
1059.Lin_cbc_frame_setup:
1060	mov	-8(%rax),%rbx
1061	mov	-16(%rax),%rbp
1062	mov	-24(%rax),%r12
1063	mov	-32(%rax),%r13
1064	mov	-40(%rax),%r14
1065	mov	-48(%rax),%r15
1066	mov	%rbx,144($context)	# restore context->Rbx
1067	mov	%rbp,160($context)	# restore context->Rbp
1068	mov	%r12,216($context)	# restore context->R12
1069	mov	%r13,224($context)	# restore context->R13
1070	mov	%r14,232($context)	# restore context->R14
1071	mov	%r15,240($context)	# restore context->R15
1072
1073.Lin_cbc_prologue:
1074	mov	8(%rax),%rdi
1075	mov	16(%rax),%rsi
1076	mov	%rax,152($context)	# restore context->Rsp
1077	mov	%rsi,168($context)	# restore context->Rsi
1078	mov	%rdi,176($context)	# restore context->Rdi
1079
1080.align	4
1081.Lcommon_seh_exit:
1082
1083	mov	40($disp),%rdi		# disp->ContextRecord
1084	mov	$context,%rsi		# context
1085	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
1086	.long	0xa548f3fc		# cld; rep movsq
1087
1088	mov	$disp,%rsi
1089	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1090	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1091	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1092	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1093	mov	40(%rsi),%r10		# disp->ContextRecord
1094	lea	56(%rsi),%r11		# &disp->HandlerData
1095	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1096	mov	%r10,32(%rsp)		# arg5
1097	mov	%r11,40(%rsp)		# arg6
1098	mov	%r12,48(%rsp)		# arg7
1099	mov	%rcx,56(%rsp)		# arg8, (NULL)
1100	call	*__imp_RtlVirtualUnwind(%rip)
1101
1102	mov	\$1,%eax		# ExceptionContinueSearch
1103	lea	64(%rsp),%rsp
1104	popfq
1105	pop	%r15
1106	pop	%r14
1107	pop	%r13
1108	pop	%r12
1109	pop	%rbp
1110	pop	%rbx
1111	pop	%rdi
1112	pop	%rsi
1113	ret
1114.size	cbc_se_handler,.-cbc_se_handler
1115
1116.section	.pdata
1117.align	4
1118	.rva	.LSEH_begin_Camellia_EncryptBlock_Rounds
1119	.rva	.LSEH_end_Camellia_EncryptBlock_Rounds
1120	.rva	.LSEH_info_Camellia_EncryptBlock_Rounds
1121
1122	.rva	.LSEH_begin_Camellia_DecryptBlock_Rounds
1123	.rva	.LSEH_end_Camellia_DecryptBlock_Rounds
1124	.rva	.LSEH_info_Camellia_DecryptBlock_Rounds
1125
1126	.rva	.LSEH_begin_Camellia_Ekeygen
1127	.rva	.LSEH_end_Camellia_Ekeygen
1128	.rva	.LSEH_info_Camellia_Ekeygen
1129
1130	.rva	.LSEH_begin_Camellia_cbc_encrypt
1131	.rva	.LSEH_end_Camellia_cbc_encrypt
1132	.rva	.LSEH_info_Camellia_cbc_encrypt
1133
1134.section	.xdata
1135.align	8
1136.LSEH_info_Camellia_EncryptBlock_Rounds:
1137	.byte	9,0,0,0
1138	.rva	common_se_handler
1139	.rva	.Lenc_prologue,.Lenc_epilogue	# HandlerData[]
1140.LSEH_info_Camellia_DecryptBlock_Rounds:
1141	.byte	9,0,0,0
1142	.rva	common_se_handler
1143	.rva	.Ldec_prologue,.Ldec_epilogue	# HandlerData[]
1144.LSEH_info_Camellia_Ekeygen:
1145	.byte	9,0,0,0
1146	.rva	common_se_handler
1147	.rva	.Lkey_prologue,.Lkey_epilogue	# HandlerData[]
1148.LSEH_info_Camellia_cbc_encrypt:
1149	.byte	9,0,0,0
1150	.rva	cbc_se_handler
1151___
1152}
1153
# Post-process the accumulated template: every back-quoted span is a Perl
# expression (constant arithmetic or a &hi()/&lo() register mapping) and
# is replaced by its value.  A span that fails to evaluate aborts the
# run instead of silently leaving an empty operand in the output.
$code =~ s{\`([^\`]*)\`}{
	my $expr=$1;
	my $value=eval $expr;
	die "can't evaluate `$expr`: $@" if $@;
	$value;
}gem;
print $code;
# STDOUT is the pipe to the translator; its failures surface at close.
close STDOUT or die "error closing STDOUT: $!";
1157