#! /usr/bin/env perl
# Copyright 1998-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# [Re]written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# At some point it became apparent that the original SSLeay RC4
# assembler implementation performs suboptimally on latest IA-32
# microarchitectures. After re-tuning performance has changed as
# following:
#
# Pentium	-10%
# Pentium III	+12%
# AMD		+50%(*)
# P4		+250%(**)
#
# (*)	This number is actually a trade-off:-) It's possible to
#	achieve +72%, but at the cost of -48% off PIII performance.
#	In other words code performing further 13% faster on AMD
#	would perform almost 2 times slower on Intel PIII...
#	For reference! This code delivers ~80% of rc4-amd64.pl
#	performance on the same Opteron machine.
# (**)	This number requires compressed key schedule set up by
#	RC4_set_key [see commentary below for further details].

# May 2011
#
# Optimize for Core2 and Westmere [and incidentally Opteron]. Current
# performance in cycles per processed byte (less is better) and
# improvement relative to previous version of this module is:
#
# Pentium	10.2			# original numbers
# Pentium III	7.8(*)
# Intel P4	7.5
#
# Opteron	6.1/+20%		# new MMX numbers
# Core2		5.3/+67%(**)
# Westmere	5.1/+94%(**)
# Sandy Bridge	5.0/+8%
# Atom		12.6/+6%
# VIA Nano	6.4/+9%
# Ivy Bridge	4.9/±0%
# Bulldozer	4.9/+15%
#
# (*)	PIII can actually deliver 6.6 cycles per byte with MMX code,
#	but this specific code performs poorly on Core2. And vice
#	versa, below MMX/SSE code delivering 5.8/7.1 on Core2 performs
#	poorly on PIII, at 8.0/14.5:-( As PIII is not a "hot" CPU
#	[anymore], I chose to discard PIII-specific code path and opt
#	for original IALU-only code, which is why MMX/SSE code path
#	is guarded by SSE2 bit (see below), not MMX/SSE.
# (**)	Performance vs. block size on Core2 and Westmere had a maximum
#	at ... 64 bytes block size. And it was quite a maximum, 40-60%
#	in comparison to largest 8KB block size. Above improvement
#	coefficients are for the largest block size.
# Locate the perlasm framework relative to this script and pull in the
# x86 assembler generator (provides &asm_init, &function_begin, &mov, ...).
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# Last command-line argument names the output file; redirect STDOUT to it
# so the perlasm back-end's print statements land there.
$output=pop;
open STDOUT,">",$output or die "can't open $output: $!";

&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");

# Register allocation used throughout the RC4 body.
$xx="eax";	# key->x
$yy="ebx";	# key->y
$tx="ecx";	# key->data[xx]
$ty="edx";	# key->data[yy]
$inp="esi";	# input pointer
$out="ebp";	# output pointer / IALU accumulator
$dat="edi";	# key schedule pointer

# Emit one round of the 4x unrolled IALU loop; $i is the unroll index,
# 0..3. Round 0 initializes the accumulator with "mov", later rounds
# merge the next keystream byte with "or" after rotating.
sub RC4_loop {
    my $i=shift;
    my $func = ($i==0)?*mov:*or;

	&add	(&LB($yy),&LB($tx));
	&mov	($ty,&DWP(0,$dat,$yy,4));
	&mov	(&DWP(0,$dat,$yy,4),$tx);
	&mov	(&DWP(0,$dat,$xx,4),$ty);
	&add	($ty,$tx);
	&inc	(&LB($xx));
	&and	($ty,0xff);
	&ror	($out,8)	if ($i!=0);
	if ($i<3) {
	&mov	($tx,&DWP(0,$dat,$xx,4));
	} else {
	&mov	($tx,&wparam(3));	# reload [re-biased] out
	}
	&$func	($out,&DWP(0,$dat,$ty,4));
}

# NOTE: deliberate assignment, not a typo. $alt is a compile-time
# toggle: "=0" forces the else-branch (movz-based) code path below,
# "=1" would select the pinsrw-based one. The later "if (!$alt)" /
# "if ($alt)" guards in the RC4 body rely on $alt holding exactly 0 or 1.
if ($alt=0) {
  # >20% faster on Atom and Sandy Bridge[!], 8% faster on Opteron,
  # but ~40% slower on Core2 and Westmere... Attempt to add movz
  # brings down Opteron by 25%, Atom and Sandy Bridge by 15%, yet
  # on Core2 with movz it's almost 20% slower than below alternative
  # code... Yes, it's a total mess...
  my @XX=($xx,$out);
  $RC4_loop_mmx = sub {		# SSE actually...
    my $i=shift;
    my $j=$i<=0?0:$i>>1;
    my $mm=$i<=0?"mm0":"mm".($i&1);

	&add	(&LB($yy),&LB($tx));
	&lea	(@XX[1],&DWP(1,@XX[0]));
	&pxor	("mm2","mm0")			if ($i==0);
	&psllq	("mm1",8)			if ($i==0);
	&and	(@XX[1],0xff);
	&pxor	("mm0","mm0")			if ($i<=0);
	&mov	($ty,&DWP(0,$dat,$yy,4));
	&mov	(&DWP(0,$dat,$yy,4),$tx);
	&pxor	("mm1","mm2")			if ($i==0);
	&mov	(&DWP(0,$dat,$XX[0],4),$ty);
	&add	(&LB($ty),&LB($tx));
	&movd	(@XX[0],"mm7")			if ($i==0);
	&mov	($tx,&DWP(0,$dat,@XX[1],4));
	&pxor	("mm1","mm1")			if ($i==1);
	&movq	("mm2",&QWP(0,$inp))		if ($i==1);
	&movq	(&QWP(-8,(@XX[0],$inp)),"mm1")	if ($i==0);
	&pinsrw	($mm,&DWP(0,$dat,$ty,4),$j);

	push	(@XX,shift(@XX))		if ($i>=0);
  }
} else {
  # Using pinsrw here improves performance on Intel CPUs by 2-3%, but
  # brings down AMD by 7%...
  $RC4_loop_mmx = sub {
    my $i=shift;

	&add	(&LB($yy),&LB($tx));
	&psllq	("mm1",8*(($i-1)&7))	if (abs($i)!=1);
	&mov	($ty,&DWP(0,$dat,$yy,4));
	&mov	(&DWP(0,$dat,$yy,4),$tx);
	&mov	(&DWP(0,$dat,$xx,4),$ty);
	&inc	($xx);
	&add	($ty,$tx);
	&movz	($xx,&LB($xx));		# (*)
	&movz	($ty,&LB($ty));		# (*)
	&pxor	("mm2",$i==1?"mm0":"mm1")	if ($i>=0);
	&movq	("mm0",&QWP(0,$inp))		if ($i<=0);
	&movq	(&QWP(-8,($out,$inp)),"mm2")	if ($i==0);
	&mov	($tx,&DWP(0,$dat,$xx,4));
	&movd	($i>0?"mm1":"mm2",&DWP(0,$dat,$ty,4));

	# (*)	This is the key to Core2 and Westmere performance.
	#	Without movz out-of-order execution logic confuses
	#	itself and fails to reorder loads and stores. Problem
	#	appears to be fixed in Sandy Bridge...
  }
}

&external_label("OPENSSL_ia32cap_P");

# void RC4(RC4_KEY *key,size_t len,const unsigned char *inp,unsigned char *out);
&function_begin("RC4");
	&mov	($dat,&wparam(0));	# load key schedule pointer
	&mov	($ty, &wparam(1));	# load len
	&mov	($inp,&wparam(2));	# load inp
	&mov	($out,&wparam(3));	# load out

	&xor	($xx,$xx);		# avoid partial register stalls
	&xor	($yy,$yy);

	&cmp	($ty,0);		# safety net
	&je	(&label("abort"));

	&mov	(&LB($xx),&BP(0,$dat));	# load key->x
	&mov	(&LB($yy),&BP(4,$dat));	# load key->y
	&add	($dat,8);

	&lea	($tx,&DWP(0,$inp,$ty));
	&sub	($out,$inp);		# re-bias out
	&mov	(&wparam(1),$tx);	# save input+len

	&inc	(&LB($xx));

	# detect compressed key schedule...
	&cmp	(&DWP(256,$dat),-1);
	&je	(&label("RC4_CHAR"));

	&mov	($tx,&DWP(0,$dat,$xx,4));

	&and	($ty,-4);		# how many 4-byte chunks?
	&jz	(&label("loop1"));

	&mov	(&wparam(3),$out);	# $out as accumulator in these loops
    if ($x86only) {
	&jmp	(&label("go4loop4"));
    } else {
	&test	($ty,-8);
	&jz	(&label("go4loop4"));

	&picmeup($out,"OPENSSL_ia32cap_P");
	&bt	(&DWP(0,$out),26);	# check SSE2 bit [could have been MMX]
	&jnc	(&label("go4loop4"));

	&mov	($out,&wparam(3))	if (!$alt);
	&movd	("mm7",&wparam(3))	if ($alt);
	&and	($ty,-8);
	&lea	($ty,&DWP(-8,$inp,$ty));
	&mov	(&DWP(-4,$dat),$ty);	# save input+(len/8)*8-8

	&$RC4_loop_mmx(-1);
	&jmp(&label("loop_mmx_enter"));

	&set_label("loop_mmx",16);
	&$RC4_loop_mmx(0);
	&set_label("loop_mmx_enter");
	for	($i=1;$i<8;$i++) { &$RC4_loop_mmx($i); }
	&mov	($ty,$yy);
	&xor	($yy,$yy);		# this is second key to Core2
	&mov	(&LB($yy),&LB($ty));	# and Westmere performance...
	&cmp	($inp,&DWP(-4,$dat));
	&lea	($inp,&DWP(8,$inp));
	&jb	(&label("loop_mmx"));

	# flush the last partially assembled 8-byte keystream quadword
    if ($alt) {
	&movd	($out,"mm7");
	&pxor	("mm2","mm0");
	&psllq	("mm1",8);
	&pxor	("mm1","mm2");
	&movq	(&QWP(-8,$out,$inp),"mm1");
    } else {
	&psllq	("mm1",56);
	&pxor	("mm2","mm1");
	&movq	(&QWP(-8,$out,$inp),"mm2");
    }
	&emms	();

	&cmp	($inp,&wparam(1));	# compare to input+len
	&je	(&label("done"));
	&jmp	(&label("loop1"));
    }

&set_label("go4loop4",16);
	&lea	($ty,&DWP(-4,$inp,$ty));
	&mov	(&wparam(2),$ty);	# save input+(len/4)*4-4

	&set_label("loop4");
	for ($i=0;$i<4;$i++) { RC4_loop($i); }
	&ror	($out,8);
	&xor	($out,&DWP(0,$inp));
	&cmp	($inp,&wparam(2));	# compare to input+(len/4)*4-4
	&mov	(&DWP(0,$tx,$inp),$out);# $tx holds re-biased out here
	&lea	($inp,&DWP(4,$inp));
	&mov	($tx,&DWP(0,$dat,$xx,4));
	&jb	(&label("loop4"));

	&cmp	($inp,&wparam(1));	# compare to input+len
	&je	(&label("done"));
	&mov	($out,&wparam(3));	# restore $out

	# byte-at-a-time tail loop for the "32-bit" key schedule
	&set_label("loop1",16);
	&add	(&LB($yy),&LB($tx));
	&mov	($ty,&DWP(0,$dat,$yy,4));
	&mov	(&DWP(0,$dat,$yy,4),$tx);
	&mov	(&DWP(0,$dat,$xx,4),$ty);
	&add	($ty,$tx);
	&inc	(&LB($xx));
	&and	($ty,0xff);
	&mov	($ty,&DWP(0,$dat,$ty,4));
	&xor	(&LB($ty),&BP(0,$inp));
	&lea	($inp,&DWP(1,$inp));
	&mov	($tx,&DWP(0,$dat,$xx,4));
	&cmp	($inp,&wparam(1));	# compare to input+len
	&mov	(&BP(-1,$out,$inp),&LB($ty));
	&jb	(&label("loop1"));

	&jmp	(&label("done"));

# this is essentially Intel P4 specific codepath...
&set_label("RC4_CHAR",16);
	&movz	($tx,&BP(0,$dat,$xx));
	# strangely enough unrolled loop performs over 20% slower...
	&set_label("cloop1");
	&add	(&LB($yy),&LB($tx));
	&movz	($ty,&BP(0,$dat,$yy));
	&mov	(&BP(0,$dat,$yy),&LB($tx));
	&mov	(&BP(0,$dat,$xx),&LB($ty));
	&add	(&LB($ty),&LB($tx));
	&movz	($ty,&BP(0,$dat,$ty));
	&add	(&LB($xx),1);
	&xor	(&LB($ty),&BP(0,$inp));
	&lea	($inp,&DWP(1,$inp));
	&movz	($tx,&BP(0,$dat,$xx));
	&cmp	($inp,&wparam(1));
	&mov	(&BP(-1,$out,$inp),&LB($ty));
	&jb	(&label("cloop1"));

&set_label("done");
	&dec	(&LB($xx));
	&mov	(&DWP(-4,$dat),$yy);		# save key->y
	&mov	(&BP(-8,$dat),&LB($xx));	# save key->x
&set_label("abort");
&function_end("RC4");

########################################################################

# Register allocation for RC4_set_key ($xx..$dat above no longer apply).
$inp="esi";
$out="edi";
$idi="ebp";
$ido="ecx";
$idx="edx";

# void RC4_set_key(RC4_KEY *key,int len,const unsigned char *data);
&function_begin("RC4_set_key");
	&mov	($out,&wparam(0));		# load key
	&mov	($idi,&wparam(1));		# load len
	&mov	($inp,&wparam(2));		# load data
	&picmeup($idx,"OPENSSL_ia32cap_P");

	&lea	($out,&DWP(2*4,$out));		# &key->data
	&lea	($inp,&DWP(0,$inp,$idi));	# $inp to point at the end
	&neg	($idi);
	&xor	("eax","eax");
	&mov	(&DWP(-4,$out),$idi);		# borrow key->y

	&bt	(&DWP(0,$idx),20);		# check for bit#20
	&jc	(&label("c1stloop"));

&set_label("w1stloop",16);
	&mov	(&DWP(0,$out,"eax",4),"eax");	# key->data[i]=i;
	&add	(&LB("eax"),1);			# i++;
	&jnc	(&label("w1stloop"));

	&xor	($ido,$ido);
	&xor	($idx,$idx);

&set_label("w2ndloop",16);
	&mov	("eax",&DWP(0,$out,$ido,4));
	&add	(&LB($idx),&BP(0,$inp,$idi));
	&add	(&LB($idx),&LB("eax"));
	&add	($idi,1);
	&mov	("ebx",&DWP(0,$out,$idx,4));
	&jnz	(&label("wnowrap"));
	&mov	($idi,&DWP(-4,$out));
	&set_label("wnowrap");
	&mov	(&DWP(0,$out,$idx,4),"eax");
	&mov	(&DWP(0,$out,$ido,4),"ebx");
	&add	(&LB($ido),1);
	&jnc	(&label("w2ndloop"));
&jmp	(&label("exit"));

# Unlike all other x86 [and x86_64] implementations, Intel P4 core
# [including EM64T] was found to perform poorly with above "32-bit" key
# schedule, a.k.a. RC4_INT. Performance improvement for IA-32 hand-coded
# assembler turned out to be 3.5x if re-coded for compressed 8-bit one,
# a.k.a. RC4_CHAR! It's however inappropriate to just switch to 8-bit
# schedule for x86[_64], because non-P4 implementations suffer from
# significant performance losses then, e.g. PIII exhibits >2x
# deterioration, and so does Opteron. In order to assure optimal
# all-round performance, we detect P4 at run-time and set up compressed
# key schedule, which is recognized by RC4 procedure.

&set_label("c1stloop",16);
	&mov	(&BP(0,$out,"eax"),&LB("eax"));	# key->data[i]=i;
	&add	(&LB("eax"),1);			# i++;
	&jnc	(&label("c1stloop"));

	&xor	($ido,$ido);
	&xor	($idx,$idx);
	&xor	("ebx","ebx");

&set_label("c2ndloop",16);
	&mov	(&LB("eax"),&BP(0,$out,$ido));
	&add	(&LB($idx),&BP(0,$inp,$idi));
	&add	(&LB($idx),&LB("eax"));
	&add	($idi,1);
	&mov	(&LB("ebx"),&BP(0,$out,$idx));
	&jnz	(&label("cnowrap"));
	&mov	($idi,&DWP(-4,$out));
	&set_label("cnowrap");
	&mov	(&BP(0,$out,$idx),&LB("eax"));
	&mov	(&BP(0,$out,$ido),&LB("ebx"));
	&add	(&LB($ido),1);
	&jnc	(&label("c2ndloop"));

	&mov	(&DWP(256,$out),-1);		# mark schedule as compressed

&set_label("exit");
	&xor	("eax","eax");
	&mov	(&DWP(-8,$out),"eax");		# key->x=0;
	&mov	(&DWP(-4,$out),"eax");		# key->y=0;
&function_end("RC4_set_key");

# const char *RC4_options(void);
# Returns a pointer into the "opts" string table below, selected by the
# same CPU-capability bits (bit#20, bit#26) the other routines test.
&function_begin_B("RC4_options");
	&call	(&label("pic_point"));
&set_label("pic_point");
	&blindpop("eax");
	&lea	("eax",&DWP(&label("opts")."-".&label("pic_point"),"eax"));
	&picmeup("edx","OPENSSL_ia32cap_P");
	&mov	("edx",&DWP(0,"edx"));
	&bt	("edx",20);
	&jc	(&label("1xchar"));
	&bt	("edx",26);
	&jnc	(&label("ret"));
	&add	("eax",25);	# skip past first two strings to "rc4(8x,mmx)"
	&ret	();
&set_label("1xchar");
	&add	("eax",12);	# skip past "rc4(4x,int)\0" to "rc4(1x,char)"
&set_label("ret");
	&ret	();
&set_label("opts",64);
&asciz	("rc4(4x,int)");
&asciz	("rc4(1x,char)");
&asciz	("rc4(8x,mmx)");
&asciz	("RC4 for x86, CRYPTOGAMS by <appro\@openssl.org>");
&align	(64);
&function_end_B("RC4_options");

&asm_finish();

close STDOUT or die "error closing STDOUT: $!";