#! /usr/bin/env perl
# Copyright 2008-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
#
# This module may be used under the terms of either the GNU General
# Public License version 2 or later, the GNU Lesser General Public
# License version 2.1 or later, the Mozilla Public License version
# 1.1 or the BSD License. The exact terms of either license are
# distributed along with this module. For further details see
# http://www.openssl.org/~appro/camellia/.
# ====================================================================

# Performance in cycles per processed byte (less is better) in
# 'openssl speed ...' benchmark:
#
#                         AMD K8   Core2    PIII     P4
# -evp camellia-128-ecb   21.5     22.8     27.0     28.9
# + over gcc 3.4.6        +90/11%  +70/10%  +53/4%   +160/64%
# + over icc 8.0          +48/19%  +21/15%  +21/17%  +55/37%
#
# camellia-128-cbc        17.3     21.1     23.9     25.9
#
# 128-bit key setup       196      280      256      240      cycles/key
# + over gcc 3.4.6        +30/0%   +17/11%  +11/0%   +63/40%
# + over icc 8.0          +18/3%   +10/0%   +10/3%   +21/10%
#
# Pairs of numbers in "+" rows represent performance improvement over
# compiler generated position-independent code, PIC, and non-PIC
# respectively. PIC results are of greater relevance, as this module
# is position-independent, i.e. suitable for a shared library or PIE.
# Position independence "costs" one register, which is why compilers
# are so close with non-PIC results, they have an extra register to
# spare. CBC results are better than ECB ones thanks to "zero-copy"
# private _x86_* interface, and are ~30-40% better than with compiler
# generated cmll_cbc.o, and reach ~80-90% of x86_64 performance on
# same CPU (where applicable).

# Locate the perlasm helpers relative to this script and pull in the
# x86 assembler front-end, which provides &mov, &DWP, &function_begin
# and the rest of the "instruction" subs used below.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

$OPENSSL=1;

# Last command-line argument is the output file; everything the
# generator prints goes there. Fail loudly if it cannot be opened,
# otherwise the assembly would silently end up on the original stdout.
$output = pop;
open STDOUT,">$output" or die "can't open $output: $!";

&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");

# Register allocation shared by all routines in this file.
@T=("eax","ebx","ecx","edx");
$idx="esi";
$key="edi";
$Tbl="ebp";

# stack frame layout in _x86_Camellia_* routines, frame is allocated
# by caller
$__ra=&DWP(0,"esp");	# return address
$__s0=&DWP(4,"esp");	# s0 backing store
$__s1=&DWP(8,"esp");	# s1 backing store
$__s2=&DWP(12,"esp");	# s2 backing store
$__s3=&DWP(16,"esp");	# s3 backing store
$__end=&DWP(20,"esp");	# pointer to end/start of key schedule

# stack frame layout in Camellia_[en|crypt] routines, which differs from
# above by 4 and overlaps by pointer to end/start of key schedule
$_end=&DWP(16,"esp");
$_esp=&DWP(20,"esp");

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. This is done to optimize code size.
# Byte offsets of the four logical tables inside the interleaved
# Camellia_SBOX blob; entries are addressed with a scale of 8.
$SBOX1_1110=0;		# Camellia_SBOX[0]
$SBOX4_4404=4;		# Camellia_SBOX[1]
$SBOX2_0222=2048;	# Camellia_SBOX[2]
$SBOX3_3033=2052;	# Camellia_SBOX[3]
&static_label("Camellia_SIGMA");
&static_label("Camellia_SBOX");

# Camellia_Feistel($i [,$seed [,$frame]])
#
# Emits the code for one Feistel round.
#   $i     - round index; its parity selects which pair of @T registers
#            acts as the round input (t0,t1) and which as scratch (t2,t3);
#   $seed  - byte offset of the round keys relative to $key, default 0;
#            a negative value makes the key schedule be walked backwards
#            (scale -8 instead of 8), as the decrypt path does;
#   $frame - byte offset from %esp of the s[0-3] backing store, default 0.
# On entry $idx must hold the first key word of the round; on exit it
# holds the first key word of the next round, and the two updated state
# words have been written back to the stack frame.
sub Camellia_Feistel {
my $i=$_[0];
my $seed=defined($_[1])?$_[1]:0;
my $scale=$seed<0?-8:8;
my $frame=defined($_[2])?$_[2]:0;
my $j=($i&1)*2;
# All four temporaries are lexical; a bare "my $t0=...,$t1=..." would
# declare only the first one and leak the rest as package globals.
my ($t0,$t1,$t2,$t3)=(@T[($j)%4],@T[($j+1)%4],@T[($j+2)%4],@T[($j+3)%4]);

	&xor	($t0,$idx);				# t0^=key[0]
	&xor	($t1,&DWP($seed+$i*$scale+4,$key));	# t1^=key[1]
	&movz	($idx,&HB($t0));			# (t0>>8)&0xff
	&mov	($t3,&DWP($SBOX3_3033,$Tbl,$idx,8));	# t3=SBOX3_3033[0]
	&movz	($idx,&LB($t0));			# (t0>>0)&0xff
	&xor	($t3,&DWP($SBOX4_4404,$Tbl,$idx,8));	# t3^=SBOX4_4404[0]
	&shr	($t0,16);
	&movz	($idx,&LB($t1));			# (t1>>0)&0xff
	&mov	($t2,&DWP($SBOX1_1110,$Tbl,$idx,8));	# t2=SBOX1_1110[1]
	&movz	($idx,&HB($t0));			# (t0>>24)&0xff
	&xor	($t3,&DWP($SBOX1_1110,$Tbl,$idx,8));	# t3^=SBOX1_1110[0]
	&movz	($idx,&HB($t1));			# (t1>>8)&0xff
	&xor	($t2,&DWP($SBOX4_4404,$Tbl,$idx,8));	# t2^=SBOX4_4404[1]
	&shr	($t1,16);
	&movz	($t0,&LB($t0));				# (t0>>16)&0xff
	&xor	($t3,&DWP($SBOX2_0222,$Tbl,$t0,8));	# t3^=SBOX2_0222[0]
	&movz	($idx,&HB($t1));			# (t1>>24)&0xff
	&mov	($t0,&DWP($frame+4*(($j+3)%4),"esp"));	# prefetch "s3"
	&xor	($t2,$t3);				# t2^=t3
	&rotr	($t3,8);				# t3=RightRotate(t3,8)
	&xor	($t2,&DWP($SBOX2_0222,$Tbl,$idx,8));	# t2^=SBOX2_0222[1]
	&movz	($idx,&LB($t1));			# (t1>>16)&0xff
	&mov	($t1,&DWP($frame+4*(($j+2)%4),"esp"));	# prefetch "s2"
	&xor	($t3,$t0);				# t3^=s3
	&xor	($t2,&DWP($SBOX3_3033,$Tbl,$idx,8));	# t2^=SBOX3_3033[1]
	&mov	($idx,&DWP($seed+($i+1)*$scale,$key));	# prefetch key[i+1]
	&xor	($t3,$t2);				# t3^=t2
	&mov	(&DWP($frame+4*(($j+3)%4),"esp"),$t3);	# s3=t3
	&xor	($t2,$t1);				# t2^=s2
	&mov	(&DWP($frame+4*(($j+2)%4),"esp"),$t2);	# s2=t2
}

# void Camellia_EncryptBlock_Rounds(
#		int grandRounds,
#		const Byte plaintext[],
#		const KEY_TABLE_TYPE keyTable,
#		Byte ciphertext[])
&function_begin("Camellia_EncryptBlock_Rounds");
	&mov	("eax",&wparam(0));	# load grandRounds
	&mov	($idx,&wparam(1));	# load plaintext pointer
	&mov	($key,&wparam(2));	# load key schedule pointer

	&mov	("ebx","esp");
	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
	&and	("esp",-64);

	# place stack frame just "above mod 1024" the key schedule
	# this ensures that cache associativity of 2 suffices
	&lea	("ecx",&DWP(-64-63,$key));
	&sub	("ecx","esp");
	&neg	("ecx");
	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	("esp","ecx");
	&add	("esp",4);	# 4 is reserved for callee's return address

	&shl	("eax",6);
	&lea	("eax",&DWP(0,$key,"eax"));
	&mov	($_esp,"ebx");	# save %esp
	&mov	($_end,"eax");	# save keyEnd

	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

	&mov	(@T[0],&DWP(0,$idx));	# load plaintext
	&mov	(@T[1],&DWP(4,$idx));
	&mov	(@T[2],&DWP(8,$idx));
	&bswap	(@T[0]);
	&mov	(@T[3],&DWP(12,$idx));
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&call	("_x86_Camellia_encrypt");

	&mov	("esp",$_esp);
	&bswap	(@T[0]);
	&mov	($idx,&wparam(3));	# load ciphertext pointer
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);
	&mov	(&DWP(0,$idx),@T[0]);	# write ciphertext
	&mov	(&DWP(4,$idx),@T[1]);
	&mov	(&DWP(8,$idx),@T[2]);
	&mov	(&DWP(12,$idx),@T[3]);
&function_end("Camellia_EncryptBlock_Rounds");
# V1.x API
&function_begin_B("Camellia_EncryptBlock");
	&mov	("eax",128);
	&sub	("eax",&wparam(0));	# load keyBitLength
	&mov	("eax",3);
	&adc	("eax",0);	# keyBitLength==128?3:4
	&mov	(&wparam(0),"eax");
	&jmp	(&label("Camellia_EncryptBlock_Rounds"));
&function_end_B("Camellia_EncryptBlock");

if ($OPENSSL) {
# void Camellia_encrypt(
#		const unsigned char *in,
#		unsigned char *out,
#		const CAMELLIA_KEY *key)
&function_begin("Camellia_encrypt");
	&mov	($idx,&wparam(0));	# load plaintext pointer
	&mov	($key,&wparam(2));	# load key schedule pointer

	&mov	("ebx","esp");
	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
	&and	("esp",-64);
	&mov	("eax",&DWP(272,$key));	# load grandRounds counter

	# place stack frame just "above mod 1024" the key schedule
	# this ensures that cache associativity of 2 suffices
	&lea	("ecx",&DWP(-64-63,$key));
	&sub	("ecx","esp");
	&neg	("ecx");
	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	("esp","ecx");
	&add	("esp",4);	# 4 is reserved for callee's return address

	&shl	("eax",6);
	&lea	("eax",&DWP(0,$key,"eax"));
	&mov	($_esp,"ebx");	# save %esp
	&mov	($_end,"eax");	# save keyEnd

	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

	&mov	(@T[0],&DWP(0,$idx));	# load plaintext
	&mov	(@T[1],&DWP(4,$idx));
	&mov	(@T[2],&DWP(8,$idx));
	&bswap	(@T[0]);
	&mov	(@T[3],&DWP(12,$idx));
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&call	("_x86_Camellia_encrypt");

	&mov	("esp",$_esp);
	&bswap	(@T[0]);
	&mov	($idx,&wparam(1));	# load ciphertext pointer
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);
	&mov	(&DWP(0,$idx),@T[0]);	# write ciphertext
	&mov	(&DWP(4,$idx),@T[1]);
	&mov	(&DWP(8,$idx),@T[2]);
	&mov	(&DWP(12,$idx),@T[3]);
&function_end("Camellia_encrypt");
}

# Private encrypt core. Takes the block in @T[0-3], the key schedule
# pointer in $key and the table pointer in $Tbl; the caller has already
# allocated the stack frame described by $__s0..$__end. The result is
# returned in @T[0-3].
&function_begin_B("_x86_Camellia_encrypt");
	&xor	(@T[0],&DWP(0,$key));	# ^=key[0-3]
	&xor	(@T[1],&DWP(4,$key));
	&xor	(@T[2],&DWP(8,$key));
	&xor	(@T[3],&DWP(12,$key));
	&mov	($idx,&DWP(16,$key));	# prefetch key[4]

	&mov	($__s0,@T[0]);		# save s[0-3]
	&mov	($__s1,@T[1]);
	&mov	($__s2,@T[2]);
	&mov	($__s3,@T[3]);

&set_label("loop",16);
	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16,4); }

	&add	($key,16*4);
	&cmp	($key,$__end);
	&je	(&label("done"));

	# @T[0-1] are preloaded, $idx is preloaded with key[0]
	&and	($idx,@T[0]);
	&mov	(@T[3],$__s3);
	&rotl	($idx,1);
	&mov	(@T[2],@T[3]);
	&xor	(@T[1],$idx);
	&or	(@T[2],&DWP(12,$key));
	&mov	($__s1,@T[1]);		# s1^=LeftRotate(s0&key[0],1);
	&xor	(@T[2],$__s2);

	&mov	($idx,&DWP(4,$key));
	&mov	($__s2,@T[2]);		# s2^=s3|key[3];
	&or	($idx,@T[1]);
	&and	(@T[2],&DWP(8,$key));
	&xor	(@T[0],$idx);
	&rotl	(@T[2],1);
	&mov	($__s0,@T[0]);		# s0^=s1|key[1];
	&xor	(@T[3],@T[2]);
	&mov	($idx,&DWP(16,$key));	# prefetch key[4]
	&mov	($__s3,@T[3]);		# s3^=LeftRotate(s2&key[2],1);
	&jmp	(&label("loop"));

&set_label("done",8);
	&mov	(@T[2],@T[0]);		# SwapHalf
	&mov	(@T[3],@T[1]);
	&mov	(@T[0],$__s2);
	&mov	(@T[1],$__s3);
	&xor	(@T[0],$idx);		# $idx is preloaded with key[0]
	&xor	(@T[1],&DWP(4,$key));
	&xor	(@T[2],&DWP(8,$key));
	&xor	(@T[3],&DWP(12,$key));
	&ret	();
&function_end_B("_x86_Camellia_encrypt");

# void Camellia_DecryptBlock_Rounds(
#		int grandRounds,
#		const Byte ciphertext[],
#		const KEY_TABLE_TYPE keyTable,
#		Byte plaintext[])
&function_begin("Camellia_DecryptBlock_Rounds");
	&mov	("eax",&wparam(0));	# load grandRounds
	&mov	($idx,&wparam(1));	# load ciphertext pointer
	&mov	($key,&wparam(2));	# load key schedule pointer

	&mov	("ebx","esp");
	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
	&and	("esp",-64);

	# place stack frame just "above mod 1024" the key schedule
	# this ensures that cache associativity of 2 suffices
	&lea	("ecx",&DWP(-64-63,$key));
	&sub	("ecx","esp");
	&neg	("ecx");
	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	("esp","ecx");
	&add	("esp",4);	# 4 is reserved for callee's return address

	&shl	("eax",6);
	&mov	(&DWP(4*4,"esp"),$key);	# save keyStart
	&lea	($key,&DWP(0,$key,"eax"));
	&mov	(&DWP(5*4,"esp"),"ebx");# save %esp

	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

	&mov	(@T[0],&DWP(0,$idx));	# load ciphertext
	&mov	(@T[1],&DWP(4,$idx));
	&mov	(@T[2],&DWP(8,$idx));
	&bswap	(@T[0]);
	&mov	(@T[3],&DWP(12,$idx));
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&call	("_x86_Camellia_decrypt");

	&mov	("esp",&DWP(5*4,"esp"));
	&bswap	(@T[0]);
	&mov	($idx,&wparam(3));	# load plaintext pointer
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);
	&mov	(&DWP(0,$idx),@T[0]);	# write plaintext
	&mov	(&DWP(4,$idx),@T[1]);
	&mov	(&DWP(8,$idx),@T[2]);
	&mov	(&DWP(12,$idx),@T[3]);
&function_end("Camellia_DecryptBlock_Rounds");
# V1.x API
&function_begin_B("Camellia_DecryptBlock");
	&mov	("eax",128);
	&sub	("eax",&wparam(0));	# load keyBitLength
	&mov	("eax",3);
	&adc	("eax",0);	# keyBitLength==128?3:4
	&mov	(&wparam(0),"eax");
	&jmp	(&label("Camellia_DecryptBlock_Rounds"));
&function_end_B("Camellia_DecryptBlock");

if ($OPENSSL) {
# void Camellia_decrypt(
#		const unsigned char *in,
#		unsigned char *out,
#		const CAMELLIA_KEY *key)
&function_begin("Camellia_decrypt");
	&mov	($idx,&wparam(0));	# load ciphertext pointer
	&mov	($key,&wparam(2));	# load key schedule pointer

	&mov	("ebx","esp");
	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
	&and	("esp",-64);
	&mov	("eax",&DWP(272,$key));	# load grandRounds counter

	# place stack frame just "above mod 1024" the key schedule
	# this ensures that cache associativity of 2 suffices
	&lea	("ecx",&DWP(-64-63,$key));
	&sub	("ecx","esp");
	&neg	("ecx");
	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	("esp","ecx");
	&add	("esp",4);	# 4 is reserved for callee's return address

	&shl	("eax",6);
	&mov	(&DWP(4*4,"esp"),$key);	# save keyStart
	&lea	($key,&DWP(0,$key,"eax"));
	&mov	(&DWP(5*4,"esp"),"ebx");# save %esp

	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

	&mov	(@T[0],&DWP(0,$idx));	# load ciphertext
	&mov	(@T[1],&DWP(4,$idx));
	&mov	(@T[2],&DWP(8,$idx));
	&bswap	(@T[0]);
	&mov	(@T[3],&DWP(12,$idx));
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&call	("_x86_Camellia_decrypt");

	&mov	("esp",&DWP(5*4,"esp"));
	&bswap	(@T[0]);
	&mov	($idx,&wparam(1));	# load plaintext pointer
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);
	&mov	(&DWP(0,$idx),@T[0]);	# write plaintext
	&mov	(&DWP(4,$idx),@T[1]);
	&mov	(&DWP(8,$idx),@T[2]);
	&mov	(&DWP(12,$idx),@T[3]);
&function_end("Camellia_decrypt");
}

# Private decrypt core. Same register and stack-frame contract as
# _x86_Camellia_encrypt, but $key starts at the end of the key schedule
# and is walked backwards until it reaches the saved start ($__end).
&function_begin_B("_x86_Camellia_decrypt");
	&xor	(@T[0],&DWP(0,$key));	# ^=key[0-3]
	&xor	(@T[1],&DWP(4,$key));
	&xor	(@T[2],&DWP(8,$key));
	&xor	(@T[3],&DWP(12,$key));
	&mov	($idx,&DWP(-8,$key));	# prefetch key[-2]

	&mov	($__s0,@T[0]);		# save s[0-3]
	&mov	($__s1,@T[1]);
	&mov	($__s2,@T[2]);
	&mov	($__s3,@T[3]);

&set_label("loop",16);
	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8,4); }

	&sub	($key,16*4);
	&cmp	($key,$__end);
	&je	(&label("done"));

	# @T[0-1] are preloaded, $idx is preloaded with key[2]
	&and	($idx,@T[0]);
	&mov	(@T[3],$__s3);
	&rotl	($idx,1);
	&mov	(@T[2],@T[3]);
	&xor	(@T[1],$idx);
	&or	(@T[2],&DWP(4,$key));
	&mov	($__s1,@T[1]);		# s1^=LeftRotate(s0&key[2],1);
	&xor	(@T[2],$__s2);

	&mov	($idx,&DWP(12,$key));
	&mov	($__s2,@T[2]);		# s2^=s3|key[1];
	&or	($idx,@T[1]);
	&and	(@T[2],&DWP(0,$key));
	&xor	(@T[0],$idx);
	&rotl	(@T[2],1);
	&mov	($__s0,@T[0]);		# s0^=s1|key[3];
	&xor	(@T[3],@T[2]);
	&mov	($idx,&DWP(-8,$key));	# prefetch key[-2]
	&mov	($__s3,@T[3]);		# s3^=LeftRotate(s2&key[0],1);
	&jmp	(&label("loop"));

&set_label("done",8);
	&mov	(@T[2],@T[0]);		# SwapHalf
	&mov	(@T[3],@T[1]);
	&mov	(@T[0],$__s2);
	&mov	(@T[1],$__s3);
	&xor	(@T[2],$idx);		# $idx is preloaded with key[2]
	&xor	(@T[3],&DWP(12,$key));
	&xor	(@T[0],&DWP(0,$key));
	&xor	(@T[1],&DWP(4,$key));
	&ret	();
&function_end_B("_x86_Camellia_decrypt");

# shld is very slow on Intel P4 family. Even on AMD it limits
# instruction decode rate [because it's VectorPath] and consequently
# performance. PIII, PM and Core[2] seem to be the only ones which
# execute this code ~7% faster...
#
# __rotl128($i0,$i1,$i2,$i3,$rot,$rnd,@regs) is the shld-based variant:
# it rotates the 128-bit value held in $i0..$i3 left by $rot bits and
# stores those registers that appear, in order, in @regs at
# -128+4*(2*$rnd+n) off $key. $idx is used as scratch.
sub __rotl128 {
  my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;

    $rnd *= 2;
    if ($rot) {
	&mov	($idx,$i0);
	&shld	($i0,$i1,$rot);
	&shld	($i1,$i2,$rot);
	&shld	($i2,$i3,$rot);
	&shld	($i3,$idx,$rot);
    }
    &mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i0 eq @T[0]);
    &mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i1 eq @T[0]);
    &mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i2 eq @T[0]);
    &mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i3 eq @T[0]);
}

# ... Implementing 128-bit rotate without shld gives >3x performance
# improvement on P4, only ~7% degradation on other Intel CPUs and
# not worse performance on AMD. This is therefore preferred.
# _rotl128($i0,$i1,$i2,$i3,$rot,$rnd,@regs)
#
# Emits a shld-free left rotation of the 128-bit value held in registers
# $i0..$i3 by $rot bits (shifts by $rot and 32-$rot are combined, so
# $rot is expected to be below 32). Registers that appear, in order, in
# the trailing @regs list are stored at -128+4*(2*$rnd+n) off $key,
# interleaved with the rotation. With $rot==0 only the stores are
# emitted. $idx and $Tbl are used as scratch.
sub _rotl128 {
  my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;

    $rnd *= 2;
    if ($rot) {
	&mov	($Tbl,$i0);
	&shl	($i0,$rot);
	&mov	($idx,$i1);
	&shr	($idx,32-$rot);
	&shl	($i1,$rot);
	&or	($i0,$idx);
	&mov	($idx,$i2);
	&shl	($i2,$rot);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i0 eq @T[0]);
	&shr	($idx,32-$rot);
	&or	($i1,$idx);
	&shr	($Tbl,32-$rot);
	&mov	($idx,$i3);
	&shr	($idx,32-$rot);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i1 eq @T[0]);
	&shl	($i3,$rot);
	&or	($i2,$idx);
	&or	($i3,$Tbl);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i2 eq @T[0]);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i3 eq @T[0]);
    } else {
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i0 eq @T[0]);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i1 eq @T[0]);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i2 eq @T[0]);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i3 eq @T[0]);
    }
}

# _saveround($rnd,$key,[$bias,]@regs)
#
# Stores up to four registers at $bias+$rnd*8 off $key. The optional
# $bias is recognized by being a non-zero number; register names
# evaluate to 0 under int() and are therefore never taken for a bias.
sub _saveround {
my ($rnd,$key,@T)=@_;
my $bias=int(@T[0])?shift(@T):0;

	&mov	(&DWP($bias+$rnd*8+0,$key),@T[0]);
	&mov	(&DWP($bias+$rnd*8+4,$key),@T[1])	if ($#T>=1);
	&mov	(&DWP($bias+$rnd*8+8,$key),@T[2])	if ($#T>=2);
	&mov	(&DWP($bias+$rnd*8+12,$key),@T[3])	if ($#T>=3);
}

# _loadround($rnd,$key,[$bias,]@regs)
#
# Counterpart of _saveround: loads up to four registers from
# $bias+$rnd*8 off $key, with the same optional-bias convention.
sub _loadround {
my ($rnd,$key,@T)=@_;
my $bias=int(@T[0])?shift(@T):0;

	&mov	(@T[0],&DWP($bias+$rnd*8+0,$key));
	&mov	(@T[1],&DWP($bias+$rnd*8+4,$key))	if ($#T>=1);
	&mov	(@T[2],&DWP($bias+$rnd*8+8,$key))	if ($#T>=2);
	&mov	(@T[3],&DWP($bias+$rnd*8+12,$key))	if ($#T>=3);
}

# void Camellia_Ekeygen(
#		const int keyBitLength,
#		const Byte *rawKey,
#		KEY_TABLE_TYPE keyTable)
&function_begin("Camellia_Ekeygen");
{ my $step=0;

	&stack_push(4);				# place for s[0-3]

	&mov	($Tbl,&wparam(0));		# load arguments
	&mov	($idx,&wparam(1));
	&mov	($key,&wparam(2));

	&mov	(@T[0],&DWP(0,$idx));		# load 0-127 bits
	&mov	(@T[1],&DWP(4,$idx));
	&mov	(@T[2],&DWP(8,$idx));
	&mov	(@T[3],&DWP(12,$idx));

	&bswap	(@T[0]);
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&_saveround	(0,$key,@T);		# KL<<<0

	&cmp	($Tbl,128);
	&je	(&label("1st128"));

	&mov	(@T[0],&DWP(16,$idx));		# load 128-191 bits
	&mov	(@T[1],&DWP(20,$idx));
	&cmp	($Tbl,192);
	&je	(&label("1st192"));
	&mov	(@T[2],&DWP(24,$idx));		# load 192-255 bits
	&mov	(@T[3],&DWP(28,$idx));
	&jmp	(&label("1st256"));
&set_label("1st192",4);
	# 192-bit key: the missing 64 bits are the complement of bits 128-191
	&mov	(@T[2],@T[0]);
	&mov	(@T[3],@T[1]);
	&not	(@T[2]);
	&not	(@T[3]);
&set_label("1st256",4);
	&bswap	(@T[0]);
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&_saveround	(4,$key,@T);		# temporary storage for KR!

	&xor	(@T[0],&DWP(0*8+0,$key));	# KR^KL
	&xor	(@T[1],&DWP(0*8+4,$key));
	&xor	(@T[2],&DWP(1*8+0,$key));
	&xor	(@T[3],&DWP(1*8+4,$key));

&set_label("1st128",4);
	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
	&lea	($key,&DWP(&label("Camellia_SIGMA")."-".&label("Camellia_SBOX"),$Tbl));

	&mov	($idx,&DWP($step*8,$key));	# prefetch SIGMA[0]
	&mov	(&swtmp(0),@T[0]);		# save s[0-3]
	&mov	(&swtmp(1),@T[1]);
	&mov	(&swtmp(2),@T[2]);
	&mov	(&swtmp(3),@T[3]);
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);
	&mov	(@T[2],&swtmp(2));
	&mov	(@T[3],&swtmp(3));

	&mov	($idx,&wparam(2));
	&xor	(@T[0],&DWP(0*8+0,$idx));	# ^KL
	&xor	(@T[1],&DWP(0*8+4,$idx));
	&xor	(@T[2],&DWP(1*8+0,$idx));
	&xor	(@T[3],&DWP(1*8+4,$idx));

	&mov	($idx,&DWP($step*8,$key));	# prefetch SIGMA[4]
	&mov	(&swtmp(0),@T[0]);		# save s[0-3]
	&mov	(&swtmp(1),@T[1]);
	&mov	(&swtmp(2),@T[2]);
	&mov	(&swtmp(3),@T[3]);
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);
	&mov	(@T[2],&swtmp(2));
	&mov	(@T[3],&swtmp(3));

	&mov	($idx,&wparam(0));
	&cmp	($idx,128);
	&jne	(&label("2nd256"));

	&mov	($key,&wparam(2));
	&lea	($key,&DWP(128,$key));		# size optimization

	####### process KA
	&_saveround	(2,$key,-128,@T);	# KA<<<0
	&_rotl128	(@T,15,6,@T);		# KA<<<15
	&_rotl128	(@T,15,8,@T);		# KA<<<(15+15=30)
	&_rotl128	(@T,15,12,@T[0],@T[1]);	# KA<<<(30+15=45)
	&_rotl128	(@T,15,14,@T);		# KA<<<(45+15=60)
	push		(@T,shift(@T));		# rotl128(@T,32);
	&_rotl128	(@T,2,20,@T);		# KA<<<(60+32+2=94)
	&_rotl128	(@T,17,24,@T);		# KA<<<(94+17=111)

	####### process KL
	&_loadround	(0,$key,-128,@T);	# load KL
	&_rotl128	(@T,15,4,@T);		# KL<<<15
	&_rotl128	(@T,30,10,@T);		# KL<<<(15+30=45)
	&_rotl128	(@T,15,13,@T[2],@T[3]);	# KL<<<(45+15=60)
	&_rotl128	(@T,17,16,@T);		# KL<<<(60+17=77)
	&_rotl128	(@T,17,18,@T);		# KL<<<(77+17=94)
	&_rotl128	(@T,17,22,@T);		# KL<<<(94+17=111)

	while (@T[0] ne "eax")			# restore order
	{   unshift	(@T,pop(@T));   }

	&mov	("eax",3);			# 3 grandRounds
	&jmp	(&label("done"));

&set_label("2nd256",16);
	&mov	($idx,&wparam(2));
	&_saveround	(6,$idx,@T);		# temporary storage for KA!

	&xor	(@T[0],&DWP(4*8+0,$idx));	# KA^KR
	&xor	(@T[1],&DWP(4*8+4,$idx));
	&xor	(@T[2],&DWP(5*8+0,$idx));
	&xor	(@T[3],&DWP(5*8+4,$idx));

	&mov	($idx,&DWP($step*8,$key));	# prefetch SIGMA[8]
	&mov	(&swtmp(0),@T[0]);		# save s[0-3]
	&mov	(&swtmp(1),@T[1]);
	&mov	(&swtmp(2),@T[2]);
	&mov	(&swtmp(3),@T[3]);
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);
	&mov	(@T[2],&swtmp(2));
	&mov	(@T[3],&swtmp(3));

	&mov	($key,&wparam(2));
	&lea	($key,&DWP(128,$key));		# size optimization

	####### process KB
	&_saveround	(2,$key,-128,@T);	# KB<<<0
	&_rotl128	(@T,30,10,@T);		# KB<<<30
	&_rotl128	(@T,30,20,@T);		# KB<<<(30+30=60)
	push		(@T,shift(@T));		# rotl128(@T,32);
	&_rotl128	(@T,19,32,@T);		# KB<<<(60+32+19=111)

	####### process KR
	&_loadround	(4,$key,-128,@T);	# load KR
	&_rotl128	(@T,15,4,@T);		# KR<<<15
	&_rotl128	(@T,15,8,@T);		# KR<<<(15+15=30)
	&_rotl128	(@T,30,18,@T);		# KR<<<(30+30=60)
	push		(@T,shift(@T));		# rotl128(@T,32);
	&_rotl128	(@T,2,26,@T);		# KR<<<(60+32+2=94)

	####### process KA
	&_loadround	(6,$key,-128,@T);	# load KA
	&_rotl128	(@T,15,6,@T);		# KA<<<15
	&_rotl128	(@T,30,14,@T);		# KA<<<(15+30=45)
	push		(@T,shift(@T));		# rotl128(@T,32);
	&_rotl128	(@T,0,24,@T);		# KA<<<(45+32+0=77)
	&_rotl128	(@T,17,28,@T);		# KA<<<(77+17=94)

	####### process KL
	&_loadround	(0,$key,-128,@T);	# load KL
	push		(@T,shift(@T));		# rotl128(@T,32);
	&_rotl128	(@T,13,12,@T);		# KL<<<(32+13=45)
	&_rotl128	(@T,15,16,@T);		# KL<<<(45+15=60)
	&_rotl128	(@T,17,22,@T);		# KL<<<(60+17=77)
	push		(@T,shift(@T));		# rotl128(@T,32);
	&_rotl128	(@T,2,30,@T);		# KL<<<(77+32+2=111)

	while (@T[0] ne "eax")			# restore order
	{   unshift	(@T,pop(@T));   }

	&mov	("eax",4);			# 4 grandRounds
&set_label("done");
	&lea	("edx",&DWP(272-128,$key));	# end of key schedule
	&stack_pop(4);
}
&function_end("Camellia_Ekeygen");

if ($OPENSSL) {
# int Camellia_set_key (
#		const unsigned char *userKey,
#		int bits,
#		CAMELLIA_KEY *key)
&function_begin_B("Camellia_set_key");
	&push	("ebx");
	&mov	("ecx",&wparam(0));	# pull arguments
	&mov	("ebx",&wparam(1));
	&mov	("edx",&wparam(2));

	&mov	("eax",-1);
	&test	("ecx","ecx");
	&jz	(&label("done"));	# userKey==NULL?
	&test	("edx","edx");
	&jz	(&label("done"));	# key==NULL?

	&mov	("eax",-2);
	&cmp	("ebx",256);
	&je	(&label("arg_ok"));	# bits==256?
	&cmp	("ebx",192);
	&je	(&label("arg_ok"));	# bits==192?
	&cmp	("ebx",128);
	&jne	(&label("done"));	# bits!=128?
&set_label("arg_ok",4);

	&push	("edx");		# push arguments
	&push	("ecx");
	&push	("ebx");
	&call	("Camellia_Ekeygen");
	&stack_pop(3);

	# eax holds grandRounds and edx points at where to put it
	&mov	(&DWP(0,"edx"),"eax");
	&xor	("eax","eax");
&set_label("done",4);
	&pop	("ebx");
	&ret	();
&function_end_B("Camellia_set_key");
}

# Base 8-bit S-box; the four 32-bit lookup tables emitted below are
# derived from it by the S* generators.
@SBOX=(
112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
 16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);

# Table-entry generators. Each takes a byte index and returns a 32-bit
# word in which the (possibly rotated) S-box output is replicated into
# three of the four byte lanes, as the sub name's digits indicate.
sub S1110 { my $i=shift; $i=@SBOX[$i]; return $i<<24|$i<<16|$i<<8; }
sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=@SBOX[$i]; return $i<<24|$i<<16|$i; }
sub S0222 { my $i=shift; $i=@SBOX[$i]; $i=($i<<1|$i>>7)&0xff; return $i<<16|$i<<8|$i; }
sub S3033 { my $i=shift; $i=@SBOX[$i]; $i=($i>>1|$i<<7)&0xff; return $i<<24|$i<<8|$i; }

&set_label("Camellia_SIGMA",64);
&data_word(
    0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2,
    0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c,
    0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd,
    0,          0,          0,          0);
&set_label("Camellia_SBOX",64);
# tables are interleaved, remember?
for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }

# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
#			size_t length, const CAMELLIA_KEY *key,
#			unsigned char *ivp,const int enc);
{
# stack frame layout
#             -4(%esp)		# return address	 0(%esp)
#              0(%esp)		# s0			 4(%esp)
#              4(%esp)		# s1			 8(%esp)
#              8(%esp)		# s2			12(%esp)
#             12(%esp)		# s3			16(%esp)
#             16(%esp)		# end of key schedule	20(%esp)
#             20(%esp)		# %esp backup
my $_inp=&DWP(24,"esp");	#copy of wparam(0)
my $_out=&DWP(28,"esp");	#copy of wparam(1)
my $_len=&DWP(32,"esp");	#copy of wparam(2)
my $_key=&DWP(36,"esp");	#copy of wparam(3)
my $_ivp=&DWP(40,"esp");	#copy of wparam(4)
my $ivec=&DWP(44,"esp");	#ivec[16]
my $_tmp=&DWP(44,"esp");	#volatile variable [yes, aliases with ivec]
my ($s0,$s1,$s2,$s3) = @T;

&function_begin("Camellia_cbc_encrypt");
	&mov	($s2 eq "ecx"? $s2 : "",&wparam(2));	# load len
	&cmp	($s2,0);
	&je	(&label("enc_out"));

	&pushf	();
	&cld	();

	&mov	($s0,&wparam(0));	# load inp
	&mov	($s1,&wparam(1));	# load out
	#&mov	($s2,&wparam(2));	# load len
	&mov	($s3,&wparam(3));	# load key
	&mov	($Tbl,&wparam(4));	# load ivp

	# allocate aligned stack frame...
	&lea	($idx,&DWP(-64,"esp"));
	&and	($idx,-64);

	# place stack frame just "above mod 1024" the key schedule
	# this ensures that cache associativity of 2 suffices
	&lea	($key,&DWP(-64-63,$s3));
	&sub	($key,$idx);
	&neg	($key);
	&and	($key,0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	($idx,$key);

	&mov	($key,&wparam(5));	# load enc

	&exch	("esp",$idx);
	&add	("esp",4);		# reserve for return address!
	&mov	($_esp,$idx);		# save %esp

	&mov	($_inp,$s0);		# save copy of inp
	&mov	($_out,$s1);		# save copy of out
	&mov	($_len,$s2);		# save copy of len
	&mov	($_key,$s3);		# save copy of key
	&mov	($_ivp,$Tbl);		# save copy of ivp

	&call	(&label("pic_point"));	# make it PIC!
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

	&mov	($idx,32);
	&set_label("prefetch_sbox",4);
		&mov	($s0,&DWP(0,$Tbl));
		&mov	($s1,&DWP(32,$Tbl));
		&mov	($s2,&DWP(64,$Tbl));
		&mov	($s3,&DWP(96,$Tbl));
		&lea	($Tbl,&DWP(128,$Tbl));
		&dec	($idx);
	&jnz	(&label("prefetch_sbox"));
	&mov	($s0,$_key);
	&sub	($Tbl,4096);
	&mov	($idx,$_inp);
	&mov	($s3,&DWP(272,$s0));		# load grandRounds

	&cmp	($key,0);
	&je	(&label("DECRYPT"));

	&mov	($s2,$_len);
	&mov	($key,$_ivp);
	&shl	($s3,6);
	&lea	($s3,&DWP(0,$s0,$s3));
	&mov	($_end,$s3);

	&test	($s2,0xFFFFFFF0);
	&jz	(&label("enc_tail"));		# short input...

	&mov	($s0,&DWP(0,$key));		# load iv
	&mov	($s1,&DWP(4,$key));

	&set_label("enc_loop",4);
		&mov	($s2,&DWP(8,$key));
		&mov	($s3,&DWP(12,$key));

		&xor	($s0,&DWP(0,$idx));	# xor input data
		&xor	($s1,&DWP(4,$idx));
		&xor	($s2,&DWP(8,$idx));
		&bswap	($s0);
		&xor	($s3,&DWP(12,$idx));
		&bswap	($s1);
		&mov	($key,$_key);		# load key
		&bswap	($s2);
		&bswap	($s3);

		&call	("_x86_Camellia_encrypt");

		&mov	($idx,$_inp);		# load inp
		&mov	($key,$_out);		# load out

		&bswap	($s0);
		&bswap	($s1);
		&bswap	($s2);
		&mov	(&DWP(0,$key),$s0);	# save output data
		&bswap	($s3);
		&mov	(&DWP(4,$key),$s1);
		&mov	(&DWP(8,$key),$s2);
		&mov	(&DWP(12,$key),$s3);

		&mov	($s2,$_len);		# load len

		&lea	($idx,&DWP(16,$idx));
		&mov	($_inp,$idx);		# save inp

		&lea	($s3,&DWP(16,$key));
		&mov	($_out,$s3);		# save out

		&sub	($s2,16);
		&test	($s2,0xFFFFFFF0);
		&mov	($_len,$s2);		# save len
	&jnz	(&label("enc_loop"));
	&test	($s2,15);
	&jnz	(&label("enc_tail"));
	&mov	($idx,$_ivp);		# load ivp
	&mov	($s2,&DWP(8,$key));	# restore last dwords
	&mov	($s3,&DWP(12,$key));
	&mov	(&DWP(0,$idx),$s0);	# save ivec
	&mov	(&DWP(4,$idx),$s1);
	&mov	(&DWP(8,$idx),$s2);
	&mov	(&DWP(12,$idx),$s3);

	&mov	("esp",$_esp);
	&popf	();
    &set_label("enc_out");
	&function_end_A();
	&pushf	();			# kludge, never executed

    &set_label("enc_tail",4);
	&mov	($s0,$key eq "edi" ? $key : "");
	&mov	($key,$_out);			# load out
	&push	($s0);				# push ivp
	&mov	($s1,16);
	&sub	($s1,$s2);
	&cmp	($key,$idx);			# compare with inp
	&je	(&label("enc_in_place"));
	&align	(4);
	&data_word(0xA4F3F689);	# rep movsb	# copy input
	&jmp	(&label("enc_skip_in_place"));
    &set_label("enc_in_place");
	&lea	($key,&DWP(0,$key,$s2));
    &set_label("enc_skip_in_place");
	&mov	($s2,$s1);
	&xor	($s0,$s0);
	&align	(4);
	&data_word(0xAAF3F689);	# rep stosb	# zero tail
	&pop	($key);				# pop ivp

	&mov	($idx,$_out);			# output as input
	&mov	($s0,&DWP(0,$key));
	&mov	($s1,&DWP(4,$key));
	&mov	($_len,16);			# len=16
	&jmp	(&label("enc_loop"));		# one more spin...

#----------------------------- DECRYPT -----------------------------#
&set_label("DECRYPT",16);
	&shl	($s3,6);
	&lea	($s3,&DWP(0,$s0,$s3));
	&mov	($_end,$s0);
	&mov	($_key,$s3);

	&cmp	($idx,$_out);
	&je	(&label("dec_in_place"));	# in-place processing...

	&mov	($key,$_ivp);			# load ivp
	&mov	($_tmp,$key);

	&set_label("dec_loop",4);
		&mov	($s0,&DWP(0,$idx));	# read input
		&mov	($s1,&DWP(4,$idx));
		&mov	($s2,&DWP(8,$idx));
		&bswap	($s0);
		&mov	($s3,&DWP(12,$idx));
		&bswap	($s1);
		&mov	($key,$_key);		# load key
		&bswap	($s2);
		&bswap	($s3);

		&call	("_x86_Camellia_decrypt");

		&mov	($key,$_tmp);		# load ivp
		&mov	($idx,$_len);		# load len

		&bswap	($s0);
		&bswap	($s1);
		&bswap	($s2);
		&xor	($s0,&DWP(0,$key));	# xor iv
		&bswap	($s3);
		&xor	($s1,&DWP(4,$key));
		&xor	($s2,&DWP(8,$key));
		&xor	($s3,&DWP(12,$key));

		&sub	($idx,16);
		&jc	(&label("dec_partial"));
		&mov	($_len,$idx);		# save len
		&mov	($idx,$_inp);		# load inp
		&mov	($key,$_out);		# load out

		&mov	(&DWP(0,$key),$s0);	# write output
		&mov	(&DWP(4,$key),$s1);
		&mov	(&DWP(8,$key),$s2);
		&mov	(&DWP(12,$key),$s3);

		&mov	($_tmp,$idx);		# save ivp
		&lea	($idx,&DWP(16,$idx));
		&mov	($_inp,$idx);		# save inp

		&lea	($key,&DWP(16,$key));
		&mov	($_out,$key);		# save out

	&jnz	(&label("dec_loop"));
	&mov	($key,$_tmp);		# load temp ivp
    &set_label("dec_end");
	&mov	($idx,$_ivp);		# load user ivp
	&mov	($s0,&DWP(0,$key));	# load iv
	&mov	($s1,&DWP(4,$key));
	&mov	($s2,&DWP(8,$key));
	&mov	($s3,&DWP(12,$key));
	&mov	(&DWP(0,$idx),$s0);	# copy back to user
	&mov	(&DWP(4,$idx),$s1);
	&mov	(&DWP(8,$idx),$s2);
	&mov	(&DWP(12,$idx),$s3);
	&jmp	(&label("dec_out"));

    &set_label("dec_partial",4);
	&lea	($key,$ivec);
	&mov	(&DWP(0,$key),$s0);	# dump output to stack
	&mov	(&DWP(4,$key),$s1);
	&mov	(&DWP(8,$key),$s2);
	&mov	(&DWP(12,$key),$s3);
	&lea	($s2 eq "ecx" ? $s2 : "",&DWP(16,$idx));
	&mov	($idx eq "esi" ? $idx : "",$key);
	&mov	($key eq "edi" ? $key : "",$_out);	# load out
	&data_word(0xA4F3F689);	# rep movsb		# copy output
	&mov	($key,$_inp);				# use inp as temp ivp
	&jmp	(&label("dec_end"));

    &set_label("dec_in_place",4);
	&set_label("dec_in_place_loop");
		&lea	($key,$ivec);
		&mov	($s0,&DWP(0,$idx));	# read input
		&mov	($s1,&DWP(4,$idx));
		&mov	($s2,&DWP(8,$idx));
		&mov	($s3,&DWP(12,$idx));

		&mov	(&DWP(0,$key),$s0);	# copy to temp
		&mov	(&DWP(4,$key),$s1);
		&mov	(&DWP(8,$key),$s2);
		&bswap	($s0);
		&mov	(&DWP(12,$key),$s3);
		&bswap	($s1);
		&mov	($key,$_key);		# load key
		&bswap	($s2);
		&bswap	($s3);

		&call	("_x86_Camellia_decrypt");

		&mov	($key,$_ivp);		# load ivp
		&mov	($idx,$_out);		# load out

		&bswap	($s0);
		&bswap	($s1);
		&bswap	($s2);
		&xor	($s0,&DWP(0,$key));	# xor iv
		&bswap	($s3);
		&xor	($s1,&DWP(4,$key));
		&xor	($s2,&DWP(8,$key));
		&xor	($s3,&DWP(12,$key));

		&mov	(&DWP(0,$idx),$s0);	# write output
		&mov	(&DWP(4,$idx),$s1);
		&mov	(&DWP(8,$idx),$s2);
		&mov	(&DWP(12,$idx),$s3);

		&lea	($idx,&DWP(16,$idx));
		&mov	($_out,$idx);		# save out

		&lea	($idx,$ivec);
		&mov	($s0,&DWP(0,$idx));	# read temp
		&mov	($s1,&DWP(4,$idx));
		&mov	($s2,&DWP(8,$idx));
		&mov	($s3,&DWP(12,$idx));

		&mov	(&DWP(0,$key),$s0);	# copy iv
		&mov	(&DWP(4,$key),$s1);
		&mov	(&DWP(8,$key),$s2);
		&mov	(&DWP(12,$key),$s3);

		&mov	($idx,$_inp);		# load inp

		&lea	($idx,&DWP(16,$idx));
		&mov	($_inp,$idx);		# save inp

		&mov	($s2,$_len);		# load len
		&sub	($s2,16);
		&jc	(&label("dec_in_place_partial"));
		&mov	($_len,$s2);		# save len
	&jnz	(&label("dec_in_place_loop"));
	&jmp	(&label("dec_out"));

    &set_label("dec_in_place_partial",4);
	# one can argue if this is actually required...
	&mov	($key eq "edi" ? $key : "",$_out);
	&lea	($idx eq "esi" ? $idx : "",$ivec);
	&lea	($key,&DWP(0,$key,$s2));
	&lea	($idx,&DWP(16,$idx,$s2));
	&neg	($s2 eq "ecx" ? $s2 : "");
	&data_word(0xA4F3F689);	# rep movsb	# restore tail

    &set_label("dec_out",4);
	&mov	("esp",$_esp);
	&popf	();
&function_end("Camellia_cbc_encrypt");
}

&asciz("Camellia for x86 by <appro\@openssl.org>");

&asm_finish();

close STDOUT or die "error closing STDOUT: $!";