#! /usr/bin/env perl
# Copyright 2008-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
#
# This module may be used under the terms of either the GNU General
# Public License version 2 or later, the GNU Lesser General Public
# License version 2.1 or later, the Mozilla Public License version
# 1.1 or the BSD License. The exact terms of either license are
# distributed along with this module. For further details see
# http://www.openssl.org/~appro/camellia/.
# ====================================================================

# Performance in cycles per processed byte (less is better) in
# 'openssl speed ...' benchmark:
#
#			AMD64	Core2	EM64T
# -evp camellia-128-ecb	16.7	21.0	22.7
# + over gcc 3.4.6	+25%	+5%	0%
#
# camellia-128-cbc	15.7	20.4	21.1
#
# 128-bit key setup	128	216	205	cycles/key
# + over gcc 3.4.6	+54%	+39%	+15%
#
# Numbers in "+" rows represent performance improvement over compiler
# generated code. Key setup timings are impressive on AMD and Core2
# thanks to 64-bit operations being covertly deployed. Improvement on
# EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
# apparently emulates some of 64-bit operations in [32-bit] microcode.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Win64 calling convention and SEH tables are selected by flavour or by
# an output file name ending in .asm.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator next to this script or in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# hi(reg): map %eax/%rax .. %edx/%rdx to the matching high-byte register
# (%ah .. %dh). Any other operand is returned unchanged.
sub hi { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}h/;	$r; }

# lo(reg): map a 32/64-bit register name to its low-byte alias
# (%eax -> %al, %esi -> %sil, %r8d -> %r8b).
sub lo { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}l/;
			$r =~ s/%[er]([sd]i)/%${1}l/;
			$r =~ s/%(r[0-9]+)[d]?/%${1}b/;	$r; }

$t0="%eax";$t1="%ebx";$t2="%ecx";$t3="%edx";
@S=("%r8d","%r9d","%r10d","%r11d");
$i0="%esi";
$i1="%edi";
$Tbl="%rbp";	# size optimization
$inp="%r12";
$out="%r13";
$key="%r14";
$keyend="%r15";
$arg0d=$win64?"%ecx":"%edi";

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. This is done to minimize code size.
# Byte offsets of the four (pairwise interleaved) tables from $Tbl.
$SBOX1_1110=0;		# Camellia_SBOX[0]
$SBOX4_4404=4;		# Camellia_SBOX[1]
$SBOX2_0222=2048;	# Camellia_SBOX[2]
$SBOX3_3033=2052;	# Camellia_SBOX[3]

# Camellia_Feistel($i [,$seed])
#
# Append the code for one Feistel round to $code.
#   $i    - round index; its parity selects which half of @S is the input
#           (even: S[0],S[1] in, S[2],S[3] updated; odd: the other way).
#   $seed - displacement used for the key prefetch; defaults to 0.
#           A negative seed walks the schedule backwards (step -8
#           instead of 8).
# The round expects the current subkey in $t1/$t0 and leaves the next one,
# loaded from `$seed+($i+1)*$scale`($key), in the same registers.
sub Camellia_Feistel {
my ($i,$seed)=@_;
$seed=0 if (!defined($seed));
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
my ($s0,$s1,$s2,$s3)=@S[map { ($j+$_)%4 } (0..3)];

$code.=<<___;
	xor	$s0,$t0				# t0^=key[0]
	xor	$s1,$t1				# t1^=key[1]
	movz	`&hi("$t0")`,$i0		# (t0>>8)&0xff
	movz	`&lo("$t1")`,$i1		# (t1>>0)&0xff
	mov	$SBOX3_3033($Tbl,$i0,8),$t3	# t3=SBOX3_3033[0]
	mov	$SBOX1_1110($Tbl,$i1,8),$t2	# t2=SBOX1_1110[1]
	movz	`&lo("$t0")`,$i0		# (t0>>0)&0xff
	shr	\$16,$t0
	movz	`&hi("$t1")`,$i1		# (t1>>8)&0xff
	xor	$SBOX4_4404($Tbl,$i0,8),$t3	# t3^=SBOX4_4404[0]
	shr	\$16,$t1
	xor	$SBOX4_4404($Tbl,$i1,8),$t2	# t2^=SBOX4_4404[1]
	movz	`&hi("$t0")`,$i0		# (t0>>24)&0xff
	movz	`&lo("$t1")`,$i1		# (t1>>16)&0xff
	xor	$SBOX1_1110($Tbl,$i0,8),$t3	# t3^=SBOX1_1110[0]
	xor	$SBOX3_3033($Tbl,$i1,8),$t2	# t2^=SBOX3_3033[1]
	movz	`&lo("$t0")`,$i0		# (t0>>16)&0xff
	movz	`&hi("$t1")`,$i1		# (t1>>24)&0xff
	xor	$SBOX2_0222($Tbl,$i0,8),$t3	# t3^=SBOX2_0222[0]
	xor	$SBOX2_0222($Tbl,$i1,8),$t2	# t2^=SBOX2_0222[1]
	mov	`$seed+($i+1)*$scale`($key),$t1	# prefetch key[i+1]
	mov	`$seed+($i+1)*$scale+4`($key),$t0
	xor	$t3,$t2				# t2^=t3
	ror	\$8,$t3				# t3=RightRotate(t3,8)
	xor	$t2,$s2
	xor	$t2,$s3
	xor	$t3,$s3
___
}

# void Camellia_EncryptBlock_Rounds(
#		int grandRounds,
#		const Byte plaintext[],
#		const KEY_TABLE_TYPE keyTable,
#		Byte ciphertext[])
$code=<<___;
.text

# V1.x API
.globl	Camellia_EncryptBlock
.type	Camellia_EncryptBlock,\@abi-omnipotent
.align	16
Camellia_EncryptBlock:
.cfi_startproc
	movl	\$128,%eax
	subl	$arg0d,%eax
	movl	\$3,$arg0d
	adcl	\$0,$arg0d	# keyBitLength==128?3:4
	jmp	.Lenc_rounds
.cfi_endproc
.size	Camellia_EncryptBlock,.-Camellia_EncryptBlock
# V2
.globl	Camellia_EncryptBlock_Rounds
.type	Camellia_EncryptBlock_Rounds,\@function,4
.align	16
.Lenc_rounds:
Camellia_EncryptBlock_Rounds:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lenc_prologue:

	#mov	%rsi,$inp		# put away arguments
	mov	%rcx,$out
	mov	%rdx,$key

	shl	\$6,%edi		# process grandRounds
	lea	.LCamellia_SBOX(%rip),$Tbl
	lea	($key,%rdi),$keyend

	mov	0(%rsi),@S[0]		# load plaintext
	mov	4(%rsi),@S[1]
	mov	8(%rsi),@S[2]
	bswap	@S[0]
	mov	12(%rsi),@S[3]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]

	call	_x86_64_Camellia_encrypt

	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	mov	@S[0],0($out)
	bswap	@S[3]
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%rbp
.cfi_restore	%rbp
	mov	32(%rsp),%rbx
.cfi_restore	%rbx
	lea	40(%rsp),%rsp
.cfi_adjust_cfa_offset	-40
.Lenc_epilogue:
	ret
.cfi_endproc
.size	Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds

.type	_x86_64_Camellia_encrypt,\@abi-omnipotent
.align	16
_x86_64_Camellia_encrypt:
.cfi_startproc
	xor	0($key),@S[1]
	xor	4($key),@S[0]		# ^=key[0-3]
	xor	8($key),@S[3]
	xor	12($key),@S[2]
.align	16
.Leloop:
	mov	16($key),$t1		# prefetch key[4-5]
	mov	20($key),$t0

___
	# six Feistel rounds per pass of .Leloop, key offsets start at 16
	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
$code.=<<___;
	lea	16*4($key),$key
	cmp	$keyend,$key
	mov	8($key),$t3		# prefetch key[2-3]
	mov	12($key),$t2
	je	.Ledone

	and	@S[0],$t0
	or	@S[3],$t3
	rol	\$1,$t0
	xor	$t3,@S[2]		# s2^=s3|key[3];
	xor	$t0,@S[1]		# s1^=LeftRotate(s0&key[0],1);
	and	@S[2],$t2
	or	@S[1],$t1
	rol	\$1,$t2
	xor	$t1,@S[0]		# s0^=s1|key[1];
	xor	$t2,@S[3]		# s3^=LeftRotate(s2&key[2],1);
	jmp	.Leloop

.align	16
.Ledone:
	xor	@S[2],$t0		# SwapHalf
	xor	@S[3],$t1
	xor	@S[0],$t2
	xor	@S[1],$t3

	mov	$t0,@S[0]
	mov	$t1,@S[1]
	mov	$t2,@S[2]
	mov	$t3,@S[3]

	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	_x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt

# V1.x API
.globl	Camellia_DecryptBlock
.type	Camellia_DecryptBlock,\@abi-omnipotent
.align	16
Camellia_DecryptBlock:
.cfi_startproc
	movl	\$128,%eax
	subl	$arg0d,%eax
	movl	\$3,$arg0d
	adcl	\$0,$arg0d	# keyBitLength==128?3:4
	jmp	.Ldec_rounds
.cfi_endproc
.size	Camellia_DecryptBlock,.-Camellia_DecryptBlock
# V2
.globl	Camellia_DecryptBlock_Rounds
.type	Camellia_DecryptBlock_Rounds,\@function,4
.align	16
.Ldec_rounds:
Camellia_DecryptBlock_Rounds:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Ldec_prologue:

	#mov	%rsi,$inp		# put away arguments
	mov	%rcx,$out
	mov	%rdx,$keyend

	shl	\$6,%edi		# process grandRounds
	lea	.LCamellia_SBOX(%rip),$Tbl
	lea	($keyend,%rdi),$key

	mov	0(%rsi),@S[0]		# load plaintext
	mov	4(%rsi),@S[1]
	mov	8(%rsi),@S[2]
	bswap	@S[0]
	mov	12(%rsi),@S[3]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]

	call	_x86_64_Camellia_decrypt

	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	mov	@S[0],0($out)
	bswap	@S[3]
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%rbp
.cfi_restore	%rbp
	mov	32(%rsp),%rbx
.cfi_restore	%rbx
	lea	40(%rsp),%rsp
.cfi_adjust_cfa_offset	-40
.Ldec_epilogue:
	ret
.cfi_endproc
.size	Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds

.type	_x86_64_Camellia_decrypt,\@abi-omnipotent
.align	16
_x86_64_Camellia_decrypt:
.cfi_startproc
	xor	0($key),@S[1]
	xor	4($key),@S[0]		# ^=key[0-3]
	xor	8($key),@S[3]
	xor	12($key),@S[2]
.align	16
.Ldloop:
	mov	-8($key),$t1		# prefetch key[4-5]
	mov	-4($key),$t0

___
	# same six rounds, negative seed: the schedule is walked backwards
	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
$code.=<<___;
	lea	-16*4($key),$key
	cmp	$keyend,$key
	mov	0($key),$t3		# prefetch key[2-3]
	mov	4($key),$t2
	je	.Lddone

	and	@S[0],$t0
	or	@S[3],$t3
	rol	\$1,$t0
	xor	$t3,@S[2]		# s2^=s3|key[3];
	xor	$t0,@S[1]		# s1^=LeftRotate(s0&key[0],1);
	and	@S[2],$t2
	or	@S[1],$t1
	rol	\$1,$t2
	xor	$t1,@S[0]		# s0^=s1|key[1];
	xor	$t2,@S[3]		# s3^=LeftRotate(s2&key[2],1);

	jmp	.Ldloop

.align	16
.Lddone:
	xor	@S[2],$t2
	xor	@S[3],$t3
	xor	@S[0],$t0
	xor	@S[1],$t1

	mov	$t2,@S[0]		# SwapHalf
	mov	$t3,@S[1]
	mov	$t0,@S[2]
	mov	$t1,@S[3]

	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	_x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
___

# _saveround($rnd,$key,[$bias,]@regs)
#
# Append stores of @regs into key-table slot $rnd, i.e. at displacement
# $bias+$rnd*8 from register $key.
#   - four registers: stored as 32-bit words at +0,+4,+8,+12 with each
#     pair swapped (regs[1],regs[0],regs[3],regs[2]);
#   - one or two registers: stored at +0 and (if present) +8.
# $bias is optional. It is recognised by being a plain decimal integer,
# so an explicit 0 works too; register names never match. Default is 0.
sub _saveround {
my ($rnd,$key,@T)=@_;
my $bias=(@T && $T[0] =~ /\A-?\d+\z/)?shift(@T):0;

    if ($#T==3) {
	$code.=<<___;
	mov	$T[1],`$bias+$rnd*8+0`($key)
	mov	$T[0],`$bias+$rnd*8+4`($key)
	mov	$T[3],`$bias+$rnd*8+8`($key)
	mov	$T[2],`$bias+$rnd*8+12`($key)
___
    } else {
	$code.="	mov	$T[0],`$bias+$rnd*8+0`($key)\n";
	$code.="	mov	$T[1],`$bias+$rnd*8+8`($key)\n"	if ($#T>=1);
    }
}

# _loadround($rnd,$key,[$bias,]@regs)
#
# Counterpart of _saveround for one or two registers: append loads of
# regs[0] from +0 and, if given, regs[1] from +8 of key-table slot $rnd.
# $bias is detected the same way as in _saveround.
sub _loadround {
my ($rnd,$key,@T)=@_;
my $bias=(@T && $T[0] =~ /\A-?\d+\z/)?shift(@T):0;

$code.="	mov	`$bias+$rnd*8+0`($key),$T[0]\n";
$code.="	mov	`$bias+$rnd*8+8`($key),$T[1]\n"	if ($#T>=1);
}

# shld is very slow on Intel EM64T family. Even on AMD it limits
# instruction decode rate [because it's VectorPath] and consequently
# performance...
# __rotl128($i0,$i1,$rot)
#
# Append a left rotate by $rot bits of the 128-bit value held in the
# 64-bit register pair $i0 (high half) : $i1 (low half), using shld.
# Uses %r11 as scratch. Emits nothing when $rot is 0.
# NOTE(review): not called anywhere in the code visible here; it looks
# like it is kept as the reference variant of _rotl128 below.
sub __rotl128 {
my ($i0,$i1,$rot)=@_;

    if ($rot) {
	$code.=<<___;
	mov	$i0,%r11
	shld	\$$rot,$i1,$i0
	shld	\$$rot,%r11,$i1
___
    }
}

# ... Implementing 128-bit rotate without shld gives 80% better
# performance EM64T, +15% on AMD64 and only ~7% degradation on
# Core2. This is therefore preferred.
#
# _rotl128($i0,$i1,$rot)
#
# Same contract as __rotl128, built from shl/shr/or instead of shld.
# Uses %r9 and %r11 as scratch. The emitted code shifts right by
# 64-$rot, so $rot is expected to be below 64. Emits nothing when
# $rot is 0.
sub _rotl128 {
my ($i0,$i1,$rot)=@_;

    if ($rot) {
	$code.=<<___;
	mov	$i0,%r11
	shl	\$$rot,$i0
	mov	$i1,%r9
	shr	\$`64-$rot`,%r9
	shr	\$`64-$rot`,%r11
	or	%r9,$i0
	shl	\$$rot,$i1
	or	%r11,$i1
___
    }
}

# Emit Camellia_Ekeygen (three register arguments per its .type line).
# $step counts the Feistel rounds emitted so far and is passed to
# Camellia_Feistel as the round index. The generated function returns 3
# in %eax on the 128-bit path and 4 on the other path.
{ my $step=0;

$code.=<<___;
.globl	Camellia_Ekeygen
.type	Camellia_Ekeygen,\@function,3
.align	16
Camellia_Ekeygen:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lkey_prologue:

	mov	%edi,${keyend}d		# put away arguments, keyBitLength
	mov	%rdx,$out		# keyTable

	mov	0(%rsi),@S[0]		# load 0-127 bits
	mov	4(%rsi),@S[1]
	mov	8(%rsi),@S[2]
	mov	12(%rsi),@S[3]

	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]
___
	&_saveround	(0,$out,@S);	# KL<<<0
$code.=<<___;
	cmp	\$128,$keyend		# check keyBitLength
	je	.L1st128

	mov	16(%rsi),@S[0]		# load 128-191 bits
	mov	20(%rsi),@S[1]
	cmp	\$192,$keyend
	je	.L1st192
	mov	24(%rsi),@S[2]		# load 192-255 bits
	mov	28(%rsi),@S[3]
	jmp	.L1st256
.L1st192:
	mov	@S[0],@S[2]
	mov	@S[1],@S[3]
	not	@S[2]
	not	@S[3]
.L1st256:
	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]
___
	&_saveround	(4,$out,@S);	# temp storage for KR!
$code.=<<___;
	xor	0($out),@S[1]		# KR^KL
	xor	4($out),@S[0]
	xor	8($out),@S[3]
	xor	12($out),@S[2]

.L1st128:
	lea	.LCamellia_SIGMA(%rip),$key
	lea	.LCamellia_SBOX(%rip),$Tbl

	mov	0($key),$t1
	mov	4($key),$t0
___
	# rounds 0-1, keyed from .LCamellia_SIGMA (default seed 0)
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);
$code.=<<___;
	xor	0($out),@S[1]		# ^KL
	xor	4($out),@S[0]
	xor	8($out),@S[3]
	xor	12($out),@S[2]
___
	# rounds 2-3
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);
$code.=<<___;
	cmp	\$128,$keyend
	jne	.L2nd256

	lea	128($out),$out		# size optimization
	shl	\$32,%r8		# @S[0]||
	shl	\$32,%r10		# @S[2]||
	or	%r9,%r8			# ||@S[1]
	or	%r11,%r10		# ||@S[3]
___
	# 128-bit schedule: $out was advanced by 128 above, hence the
	# -128 bias on every slot access below.
	&_loadround	(0,$out,-128,"%rax","%rbx");	# KL
	&_saveround	(2,$out,-128,"%r8","%r10");	# KA<<<0
	&_rotl128	("%rax","%rbx",15);
	&_saveround	(4,$out,-128,"%rax","%rbx");	# KL<<<15
	&_rotl128	("%r8","%r10",15);
	&_saveround	(6,$out,-128,"%r8","%r10");	# KA<<<15
	&_rotl128	("%r8","%r10",15);		# 15+15=30
	&_saveround	(8,$out,-128,"%r8","%r10");	# KA<<<30
	&_rotl128	("%rax","%rbx",30);		# 15+30=45
	&_saveround	(10,$out,-128,"%rax","%rbx");	# KL<<<45
	&_rotl128	("%r8","%r10",15);		# 30+15=45
	&_saveround	(12,$out,-128,"%r8");		# KA<<<45
	&_rotl128	("%rax","%rbx",15);		# 45+15=60
	&_saveround	(13,$out,-128,"%rbx");		# KL<<<60
	&_rotl128	("%r8","%r10",15);		# 45+15=60
	&_saveround	(14,$out,-128,"%r8","%r10");	# KA<<<60
	&_rotl128	("%rax","%rbx",17);		# 60+17=77
	&_saveround	(16,$out,-128,"%rax","%rbx");	# KL<<<77
	&_rotl128	("%rax","%rbx",17);		# 77+17=94
	&_saveround	(18,$out,-128,"%rax","%rbx");	# KL<<<94
	&_rotl128	("%r8","%r10",34);		# 60+34=94
	&_saveround	(20,$out,-128,"%r8","%r10");	# KA<<<94
	&_rotl128	("%rax","%rbx",17);		# 94+17=111
	&_saveround	(22,$out,-128,"%rax","%rbx");	# KL<<<111
	&_rotl128	("%r8","%r10",17);		# 94+17=111
	&_saveround	(24,$out,-128,"%r8","%r10");	# KA<<<111
$code.=<<___;
	mov	\$3,%eax
	jmp	.Ldone
.align	16
.L2nd256:
___
	&_saveround	(6,$out,@S);	# temp storage for KA!
$code.=<<___;
	xor	`4*8+0`($out),@S[1]	# KA^KR
	xor	`4*8+4`($out),@S[0]
	xor	`5*8+0`($out),@S[3]
	xor	`5*8+4`($out),@S[2]
___
	# rounds 4-5
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);

	&_loadround	(0,$out,"%rax","%rbx");	# KL
	&_loadround	(4,$out,"%rcx","%rdx");	# KR
	&_loadround	(6,$out,"%r14","%r15");	# KA
$code.=<<___;
	lea	128($out),$out		# size optimization
	shl	\$32,%r8		# @S[0]||
	shl	\$32,%r10		# @S[2]||
	or	%r9,%r8			# ||@S[1]
	or	%r11,%r10		# ||@S[3]
___
	# schedule for the non-128-bit path (same -128 bias convention)
	&_saveround	(2,$out,-128,"%r8","%r10");	# KB<<<0
	&_rotl128	("%rcx","%rdx",15);
	&_saveround	(4,$out,-128,"%rcx","%rdx");	# KR<<<15
	&_rotl128	("%r14","%r15",15);
	&_saveround	(6,$out,-128,"%r14","%r15");	# KA<<<15
	&_rotl128	("%rcx","%rdx",15);		# 15+15=30
	&_saveround	(8,$out,-128,"%rcx","%rdx");	# KR<<<30
	&_rotl128	("%r8","%r10",30);
	&_saveround	(10,$out,-128,"%r8","%r10");	# KB<<<30
	&_rotl128	("%rax","%rbx",45);
	&_saveround	(12,$out,-128,"%rax","%rbx");	# KL<<<45
	&_rotl128	("%r14","%r15",30);		# 15+30=45
	&_saveround	(14,$out,-128,"%r14","%r15");	# KA<<<45
	&_rotl128	("%rax","%rbx",15);		# 45+15=60
	&_saveround	(16,$out,-128,"%rax","%rbx");	# KL<<<60
	&_rotl128	("%rcx","%rdx",30);		# 30+30=60
	&_saveround	(18,$out,-128,"%rcx","%rdx");	# KR<<<60
	&_rotl128	("%r8","%r10",30);		# 30+30=60
	&_saveround	(20,$out,-128,"%r8","%r10");	# KB<<<60
	&_rotl128	("%rax","%rbx",17);		# 60+17=77
	&_saveround	(22,$out,-128,"%rax","%rbx");	# KL<<<77
	&_rotl128	("%r14","%r15",32);		# 45+32=77
	&_saveround	(24,$out,-128,"%r14","%r15");	# KA<<<77
	&_rotl128	("%rcx","%rdx",34);		# 60+34=94
	&_saveround	(26,$out,-128,"%rcx","%rdx");	# KR<<<94
	&_rotl128	("%r14","%r15",17);		# 77+17=94
	&_saveround	(28,$out,-128,"%r14","%r15");	# KA<<<94
	&_rotl128	("%rax","%rbx",34);		# 77+34=111
	&_saveround	(30,$out,-128,"%rax","%rbx");	# KL<<<111
	&_rotl128	("%r8","%r10",51);		# 60+51=111
	&_saveround	(32,$out,-128,"%r8","%r10");	# KB<<<111
$code.=<<___;
	mov	\$4,%eax
.Ldone:
	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%rbp
.cfi_restore	%rbp
	mov	32(%rsp),%rbx
.cfi_restore	%rbx
	lea	40(%rsp),%rsp
.cfi_adjust_cfa_offset	-40
.Lkey_epilogue:
	ret
.cfi_endproc
.size	Camellia_Ekeygen,.-Camellia_Ekeygen
___
}

# 256-entry byte substitution table from which the four 32-bit lookup
# tables below are derived.
@SBOX=(
112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
166,225, 57,202,213, 71, 93, 61,217, 1, 90,214, 81, 86,108, 77,
139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
223, 76,203,194, 52,126,118, 5,109,183,169, 49,209, 23, 4,215,
 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
254, 68,207,178,195,181,122,145, 36, 8,232,168, 96,252,105, 80,
170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
 16,196, 0, 72,163,247,117,219,138, 3,230,218, 9, 63,221,148,
135, 92,131, 2,205, 74,144, 51,115,103,246,243,157,127,191,226,
 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
120,152, 6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
114, 7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);

# Each helper returns one table word for index $i as a "0x%08x" string.
# The digits in the name give the byte layout, most significant first:
#   S1110: SBOX[i] in bytes 3,2,1; byte 0 is zero
#   S4404: SBOX[i rotated left by 1] in bytes 3,2,0; byte 1 is zero
#   S0222: SBOX[i] rotated left by 1, in bytes 2,1,0; byte 3 is zero
#   S3033: SBOX[i] rotated right by 1, in bytes 3,1,0; byte 2 is zero
sub S1110 { my $i=shift; $i=@SBOX[$i]; $i=$i<<24|$i<<16|$i<<8; sprintf("0x%08x",$i); }
sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=@SBOX[$i]; $i=$i<<24|$i<<16|$i; sprintf("0x%08x",$i); }
sub S0222 { my $i=shift; $i=@SBOX[$i]; $i=($i<<1|$i>>7)&0xff; $i=$i<<16|$i<<8|$i; sprintf("0x%08x",$i); }
sub S3033 { my $i=shift; $i=@SBOX[$i]; $i=($i>>1|$i<<7)&0xff; $i=$i<<24|$i<<8|$i; sprintf("0x%08x",$i); }

$code.=<<___;
.align	64
.LCamellia_SIGMA:
.long	0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
.long	0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
.long	0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
.long	0, 0, 0, 0
.LCamellia_SBOX:
___
# tables are interleaved, remember?
# data_word(@words): append one ".long" line holding the given words.
sub data_word { $code.=".long\t".join(',',@_)."\n"; }
# 256 pairs (S1110,S4404) occupy the first 2048 bytes, then 256 pairs
# (S0222,S3033) follow; this matches the $SBOX*_* offsets 0/4/2048/2052.
for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }

# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
#			size_t length, const CAMELLIA_KEY *key,
#			unsigned char *ivp,const int enc);
{
# Stack frame slots used by the generated function.
$_key="0(%rsp)";
$_end="8(%rsp)";	# inp+len&~15
$_res="16(%rsp)";	# len&15
$ivec="24(%rsp)";
$_ivp="40(%rsp)";
$_rsp="48(%rsp)";

$code.=<<___;
.globl	Camellia_cbc_encrypt
.type	Camellia_cbc_encrypt,\@function,6
.align	16
Camellia_cbc_encrypt:
.cfi_startproc
	endbranch
	cmp	\$0,%rdx
	je	.Lcbc_abort
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lcbc_prologue:

	mov	%rsp,%rbp
.cfi_def_cfa_register	%rbp
	sub	\$64,%rsp
	and	\$-64,%rsp

	# place stack frame just "above mod 1024" the key schedule,
	# this ensures that cache associativity suffices
	lea	-64-63(%rcx),%r10
	sub	%rsp,%r10
	neg	%r10
	and	\$0x3C0,%r10
	sub	%r10,%rsp
	#add	\$8,%rsp		# 8 is reserved for callee's ra

	mov	%rdi,$inp		# inp argument
	mov	%rsi,$out		# out argument
	mov	%r8,%rbx		# ivp argument
	mov	%rcx,$key		# key argument
	mov	272(%rcx),${keyend}d	# grandRounds

	mov	%r8,$_ivp
	mov	%rbp,$_rsp
.cfi_cfa_expression	$_rsp,deref,+56

.Lcbc_body:
	lea	.LCamellia_SBOX(%rip),$Tbl

	mov	\$32,%ecx
.align	4
.Lcbc_prefetch_sbox:
	mov	0($Tbl),%rax
	mov	32($Tbl),%rsi
	mov	64($Tbl),%rdi
	mov	96($Tbl),%r11
	lea	128($Tbl),$Tbl
	loop	.Lcbc_prefetch_sbox
	sub	\$4096,$Tbl
	shl	\$6,$keyend
	mov	%rdx,%rcx		# len argument
	lea	($key,$keyend),$keyend

	cmp	\$0,%r9d		# enc argument
	je	.LCBC_DECRYPT

	and	\$-16,%rdx
	and	\$15,%rcx		# length residue
	lea	($inp,%rdx),%rdx
	mov	$key,$_key
	mov	%rdx,$_end
	mov	%rcx,$_res

	cmp	$inp,%rdx
	mov	0(%rbx),@S[0]		# load IV
	mov	4(%rbx),@S[1]
	mov	8(%rbx),@S[2]
	mov	12(%rbx),@S[3]
	je	.Lcbc_enc_tail
	jmp	.Lcbc_eloop

.align	16
.Lcbc_eloop:
	xor	0($inp),@S[0]
	xor	4($inp),@S[1]
	xor	8($inp),@S[2]
	bswap	@S[0]
	xor	12($inp),@S[3]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]

	call	_x86_64_Camellia_encrypt

	mov	$_key,$key		# "rewind" the key
	bswap	@S[0]
	mov	$_end,%rdx
	bswap	@S[1]
	mov	$_res,%rcx
	bswap	@S[2]
	mov	@S[0],0($out)
	bswap	@S[3]
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	lea	16($inp),$inp
	mov	@S[3],12($out)
	cmp	%rdx,$inp
	lea	16($out),$out
	jne	.Lcbc_eloop

	cmp	\$0,%rcx
	jne	.Lcbc_enc_tail

	mov	$_ivp,$out
	mov	@S[0],0($out)		# write out IV residue
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)
	jmp	.Lcbc_done

.align	16
.Lcbc_enc_tail:
	xor	%rax,%rax
	mov	%rax,0+$ivec
	mov	%rax,8+$ivec
	mov	%rax,$_res

.Lcbc_enc_pushf:
	pushfq
	cld
	mov	$inp,%rsi
	lea	8+$ivec,%rdi
	.long	0x9066A4F3		# rep movsb
	popfq
.Lcbc_enc_popf:

	lea	$ivec,$inp
	lea	16+$ivec,%rax
	mov	%rax,$_end
	jmp	.Lcbc_eloop		# one more time

.align	16
.LCBC_DECRYPT:
	xchg	$key,$keyend
	add	\$15,%rdx
	and	\$15,%rcx		# length residue
	and	\$-16,%rdx
	mov	$key,$_key
	lea	($inp,%rdx),%rdx
	mov	%rdx,$_end
	mov	%rcx,$_res

	mov	(%rbx),%rax		# load IV
	mov	8(%rbx),%rbx
	jmp	.Lcbc_dloop
.align	16
.Lcbc_dloop:
	mov	0($inp),@S[0]
	mov	4($inp),@S[1]
	mov	8($inp),@S[2]
	bswap	@S[0]
	mov	12($inp),@S[3]
	bswap	@S[1]
	mov	%rax,0+$ivec		# save IV to temporary storage
	bswap	@S[2]
	mov	%rbx,8+$ivec
	bswap	@S[3]

	call	_x86_64_Camellia_decrypt

	mov	$_key,$key		# "rewind" the key
	mov	$_end,%rdx
	mov	$_res,%rcx

	bswap	@S[0]
	mov	($inp),%rax		# load IV for next iteration
	bswap	@S[1]
	mov	8($inp),%rbx
	bswap	@S[2]
	xor	0+$ivec,@S[0]
	bswap	@S[3]
	xor	4+$ivec,@S[1]
	xor	8+$ivec,@S[2]
	lea	16($inp),$inp
	xor	12+$ivec,@S[3]
	cmp	%rdx,$inp
	je	.Lcbc_ddone

	mov	@S[0],0($out)
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	lea	16($out),$out
	jmp	.Lcbc_dloop

.align	16
.Lcbc_ddone:
	mov	$_ivp,%rdx
	cmp	\$0,%rcx
	jne	.Lcbc_dec_tail

	mov	@S[0],0($out)
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	mov	%rax,(%rdx)		# write out IV residue
	mov	%rbx,8(%rdx)
	jmp	.Lcbc_done
.align	16
.Lcbc_dec_tail:
	mov	@S[0],0+$ivec
	mov	@S[1],4+$ivec
	mov	@S[2],8+$ivec
	mov	@S[3],12+$ivec

.Lcbc_dec_pushf:
	pushfq
	cld
	lea	8+$ivec,%rsi
	lea	($out),%rdi
	.long	0x9066A4F3		# rep movsb
	popfq
.Lcbc_dec_popf:

	mov	%rax,(%rdx)		# write out IV residue
	mov	%rbx,8(%rdx)
	jmp	.Lcbc_done

.align	16
.Lcbc_done:
	mov	$_rsp,%rcx
.cfi_def_cfa	%rcx,56
	mov	0(%rcx),%r15
.cfi_restore	%r15
	mov	8(%rcx),%r14
.cfi_restore	%r14
	mov	16(%rcx),%r13
.cfi_restore	%r13
	mov	24(%rcx),%r12
.cfi_restore	%r12
	mov	32(%rcx),%rbp
.cfi_restore	%rbp
	mov	40(%rcx),%rbx
.cfi_restore	%rbx
	lea	48(%rcx),%rsp
.cfi_def_cfa	%rsp,8
.Lcbc_abort:
	ret
.cfi_endproc
.size	Camellia_cbc_encrypt,.-Camellia_cbc_encrypt

.asciz	"Camellia for x86_64 by <appro\@openssl.org>"
___
}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Emitted only when $win64 is set: two exception handlers plus the
# .pdata/.xdata records that attach them to the functions above.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	common_se_handler,\@abi-omnipotent
.align	16
common_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	lea	-64(%rsp),%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	lea	40(%rax),%rax
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r13
	mov	-32(%rax),%r14
	mov	-40(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	jmp	.Lcommon_seh_exit
.size	common_se_handler,.-common_se_handler

.type	cbc_se_handler,\@abi-omnipotent
.align	16
cbc_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	lea	-64(%rsp),%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lcbc_prologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_prologue
	jb	.Lin_cbc_prologue

	lea	.Lcbc_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_body
	jb	.Lin_cbc_frame_setup

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lcbc_abort(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lcbc_abort
	jae	.Lin_cbc_prologue

	# handle pushf/popf in Camellia_cbc_encrypt
	lea	.Lcbc_enc_pushf(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<=.Lcbc_enc_pushf
	jbe	.Lin_cbc_no_flag
	lea	8(%rax),%rax
	lea	.Lcbc_enc_popf(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_enc_popf
	jb	.Lin_cbc_no_flag
	lea	-8(%rax),%rax
	lea	.Lcbc_dec_pushf(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<=.Lcbc_dec_pushf
	jbe	.Lin_cbc_no_flag
	lea	8(%rax),%rax
	lea	.Lcbc_dec_popf(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_dec_popf
	jb	.Lin_cbc_no_flag
	lea	-8(%rax),%rax

.Lin_cbc_no_flag:
	mov	48(%rax),%rax		# $_rsp
	lea	48(%rax),%rax

.Lin_cbc_frame_setup:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_cbc_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

.align	4
.Lcommon_seh_exit:

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	lea	64(%rsp),%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	cbc_se_handler,.-cbc_se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_Camellia_EncryptBlock_Rounds
	.rva	.LSEH_end_Camellia_EncryptBlock_Rounds
	.rva	.LSEH_info_Camellia_EncryptBlock_Rounds

	.rva	.LSEH_begin_Camellia_DecryptBlock_Rounds
	.rva	.LSEH_end_Camellia_DecryptBlock_Rounds
	.rva	.LSEH_info_Camellia_DecryptBlock_Rounds

	.rva	.LSEH_begin_Camellia_Ekeygen
	.rva	.LSEH_end_Camellia_Ekeygen
	.rva	.LSEH_info_Camellia_Ekeygen

	.rva	.LSEH_begin_Camellia_cbc_encrypt
	.rva	.LSEH_end_Camellia_cbc_encrypt
	.rva	.LSEH_info_Camellia_cbc_encrypt

.section	.xdata
.align	8
.LSEH_info_Camellia_EncryptBlock_Rounds:
	.byte	9,0,0,0
	.rva	common_se_handler
	.rva	.Lenc_prologue,.Lenc_epilogue	# HandlerData[]
.LSEH_info_Camellia_DecryptBlock_Rounds:
	.byte	9,0,0,0
	.rva	common_se_handler
	.rva	.Ldec_prologue,.Ldec_epilogue	# HandlerData[]
.LSEH_info_Camellia_Ekeygen:
	.byte	9,0,0,0
	.rva	common_se_handler
	.rva	.Lkey_prologue,.Lkey_epilogue	# HandlerData[]
.LSEH_info_Camellia_cbc_encrypt:
	.byte	9,0,0,0
	.rva	cbc_se_handler
___
}

# _eval_placeholder($expr)
#
# Evaluate one `...` placeholder taken from $code and return its value.
# Placeholders hold Perl expressions such as displacement arithmetic or
# &hi()/&lo() register conversions. A placeholder that fails to evaluate
# aborts generation instead of silently expanding to an empty operand.
sub _eval_placeholder {
    my ($expr) = @_;
    my $value = eval $expr;
    die "cannot evaluate `$expr`: $@" if $@;
    return $value;
}

# Expand every placeholder, then hand the finished text to STDOUT.
$code =~ s/\`([^\`]*)\`/_eval_placeholder($1)/gem;
print $code;
# Buffered write errors only surface at close, so check it.
close STDOUT or die "error closing STDOUT: $!";