#!/usr/local/bin/perl

# At some point it became apparent that the original SSLeay RC4
# assembler implementation performs suboptimally on the latest IA-32
# microarchitectures. After re-tuning, performance has changed as
# follows:
#
# Pentium	+0%
# Pentium III	+17%
# AMD		+52%(*)
# P4		+180%(**)
#
# (*)	This number is actually a trade-off:-) It's possible to
#	achieve	+72%, but at the cost of -48% off PIII performance.
#	In other words code performing further 13% faster on AMD
#	would perform almost 2 times slower on Intel PIII...
#	For reference! This code delivers ~80% of rc4-amd64.pl
#	performance on the same Opteron machine.
# (**)	This number requires compressed key schedule set up by
#	RC4_set_key and therefore doesn't apply to 0.9.7 [option for
#	compressed key schedule is implemented in 0.9.8 and later,
#	see commentary section in rc4_skey.c for further details].
#
#					<appro@fy.chalmers.se>

# The instruction emitters (&mov, &add, &label, ...) come from x86asm.pl.
push @INC, qw(perlasm ../../perlasm);
require "x86asm.pl";

&asm_init($ARGV[0], "rc4-586.pl");

# Register allocation shared by RC4() and RC4_loop():
#   $x,  $y   - the two indices read from / written back to the key block
#   $tx, $ty  - scratch values fetched from the table at $x / $y
#   $in, $out - input and output pointers (wparam 2 and 3)
#   $d        - pointer to the key data (wparam 0, advanced by 8 in RC4())
($x, $y, $tx, $ty, $in, $out, $d) = qw(eax ebx ecx edx esi edi ebp);

&RC4("RC4");

&asm_finish();

# RC4_loop($n, $p, $char)
#
# Emits the code for one round of the dword-table code path.
#
#   $n    - round number; used in the emitted comment and as the byte
#           displacement of the store at the end of the round.
#   $p    - position of the round inside its group: negative for the
#           first round, 0 for a middle round, 1 for the last round.
#   $char - false: the round belongs to the 8-bytes-at-a-time loop and
#           its result byte is parked at displacement $n off esp;
#           true:  the round belongs to the byte-at-a-time tail, which
#           checks the end pointer and writes straight to $out.
#
# The emitters are called in exactly the order the instructions must
# appear, so statements here must not be reordered.
sub RC4_loop {
    # Deliberately `local` rather than `my`: the dynamic scoping of the
    # original is kept because the helpers live outside this file.
    local($n,$p,$char)=@_;

    &comment("Round $n");

    if ($char && $p >= 0) {
        # Later tail rounds: reload the end pointer from swtmp(2) and
        # leave through "finished" once it is <= $in.
        &mov($ty, &swtmp(2));
        &cmp($ty, $in);
        &jbe(&label("finished"));
        &inc($in);
    }
    elsif ($char) {
        # First tail round: $ty still holds the end pointer biased by -8
        # from RC4(); undo the bias, test it, and store it for later rounds.
        &add($ty, 8);
        &inc($in);
        &cmp($ty, $in);
        &jb(&label("finished"));
        &mov(&swtmp(2), $ty);
    }

    # The load of $tx for a first round ($p < 0) is not emitted here; the
    # caller has already issued it.

    &add(&LB($y), &LB($tx));
    &mov($ty, &DWP(0,$d,$y,4));
    &mov(&DWP(0,$d,$x,4), $ty);
    &add($ty, $tx);
    &mov(&DWP(0,$d,$y,4), $tx);
    &and($ty, 0xff);
    &inc(&LB($x));                          # for the next round
    if ($p < 1) {
        &mov($tx, &DWP(0,$d,$x,4));         # for the next round
    }
    &mov($ty, &DWP(0,$d,$ty,4));

    if ($char) {
        # $in was already incremented in this round's prologue, hence
        # the -1 displacement when fetching the input byte.
        &movb(&HB($ty), &BP(-1,$in,"",0));
        &xorb(&LB($ty), &HB($ty));
        &movb(&BP($n,$out,"",0), &LB($ty));
    }
    else {
        # The "out += 8" of the 8-byte loop is folded into its last round.
        &add($out, 8) if $p >= 1;
        &movb(&BP($n,"esp","",0), &LB($ty));
    }
}

# RC4($name)
#
# Emits the complete function called $name. Parameters of the generated
# function, as read through wparam(): 0 = key, 1 = len, 2 = input
# pointer, 3 = output pointer.
#
# Layout of the generated code:
#   - immediate return when len is 0;
#   - prologue loading the two indices from offsets 0 and 4 of the key;
#   - a dword-table path: an 8-rounds-per-iteration loop ("start")
#     followed by a byte-wise tail of up to 7 rounds ("end");
#   - a separate "RC4_CHAR" path, taken when the dword at offset 256 of
#     the table compares equal to -1 (compressed key schedule);
#   - a common epilogue ("finished") that writes the indices back.
sub RC4 {
    local($name)=@_;

    &function_begin_B($name, "");

    # Nothing to do for a zero length: return before touching the stack.
    &mov($ty, &wparam(1));                  # len
    &cmp($ty, 0);
    &jne(&label("proceed"));
    &ret();
    &set_label("proceed");

    &comment("");

    # Register saves are interleaved with the index clears.
    &push("ebp");
    &push("ebx");
    &push("esi");
    &xor($x, $x);                           # avoid partial register stalls
    &push("edi");
    &xor($y, $y);                           # avoid partial register stalls
    &mov($d, &wparam(0));                   # key
    &mov($in, &wparam(2));

    &movb(&LB($x), &BP(0,$d,"",1));
    &movb(&LB($y), &BP(4,$d,"",1));

    &mov($out, &wparam(3));
    &inc(&LB($x));

    &stack_push(3);                         # 3 temp variables
    &add($d, 8);                            # step over the two index slots

    # detect compressed schedule, see commentary section in rc4_skey.c...
    # in 0.9.7 context ~50 bytes below RC4_CHAR label remain redundant,
    # as compressed key schedule is set up in 0.9.8 and later.
    &cmp(&DWP(256,$d), -1);
    &je(&label("RC4_CHAR"));

    # $ty = in + len - 8: the address at which the 8-byte loop exits.
    &lea($ty, &DWP(-8,$ty,$in));
    &mov(&swtmp(2), $ty);
    &mov($tx, &DWP(0,$d,$x,4));

    &cmp($ty, $in);
    &jb(&label("end"));                     # less than 8 bytes

    &set_label("start");

    # filling DELAY SLOT
    &add($in, 8);

    # Eight rounds per iteration; each parks one result byte at esp+0..7.
    &RC4_loop(0, -1, 0);
    foreach my $round (1 .. 6) {
        &RC4_loop($round, 0, 0);
    }
    &RC4_loop(7, 1, 0);                     # also emits the out += 8

    &comment("apply the cipher text");

    # XOR the two parked dwords (swtmp(0) and swtmp(1), presumably the
    # same stack bytes the rounds just wrote -- confirm against
    # x86asm.pl) with 8 input bytes and store them to the output.
    &mov($tx, &swtmp(0));
    &mov($ty, &DWP(-8,$in,"",0));
    &xor($tx, $ty);
    &mov($ty, &DWP(-4,$in,"",0));
    &mov(&DWP(-8,$out,"",0), $tx);
    &mov($tx, &swtmp(1));
    &xor($tx, $ty);
    &mov($ty, &swtmp(2));                   # load end ptr
    &mov(&DWP(-4,$out,"",0), $tx);
    &mov($tx, &DWP(0,$d,$x,4));
    &cmp($in, $ty);
    &jbe(&label("start"));

    &set_label("end");

    # Byte-wise tail: at most seven rounds, each of which checks the end
    # pointer itself and branches to "finished" when the input is used
    # up. The first round carries the extra end-pointer fix-up.
    &RC4_loop(0, -1, 1);
    foreach my $round (1 .. 5) {
        &RC4_loop($round, 0, 1);
    }
    &RC4_loop(6, 1, 1);

    &jmp(&label("finished"));

    &align(16);
    # this is essentially Intel P4 specific codepath, see rc4_skey.c,
    # and is engaged in 0.9.8 and later context...
    &set_label("RC4_CHAR");

    # $ty = in + len, kept in swtmp(2) as the end pointer of this loop.
    &lea ($ty, &DWP(0,$in,$ty));
    &mov (&swtmp(2), $ty);
    &movz ($tx, &BP(0,$d,$x));

    # strangely enough unrolled loop performs over 20% slower...
    &set_label("RC4_CHAR_loop");
    &add (&LB($y), &LB($tx));
    &movz ($ty, &BP(0,$d,$y));
    &movb (&BP(0,$d,$y), &LB($tx));
    &movb (&BP(0,$d,$x), &LB($ty));
    &add (&LB($ty), &LB($tx));
    &movz ($ty, &BP(0,$d,$ty));
    &add (&LB($x), 1);
    &xorb (&LB($ty), &BP(0,$in));
    &lea ($in, &DWP(1,$in));
    &movz ($tx, &BP(0,$d,$x));
    &cmp ($in, &swtmp(2));
    &movb (&BP(0,$out), &LB($ty));
    &lea ($out, &DWP(1,$out));
    &jb (&label("RC4_CHAR_loop"));

    # Common exit: undo the look-ahead increment of $x and write both
    # indices back. $d was advanced by 8 above, so displacements -8 and
    # -4 are the slots the indices were loaded from.
    &set_label("finished");
    &dec($x);
    &stack_pop(3);
    &movb(&BP(-4,$d,"",0), &LB($y));
    &movb(&BP(-8,$d,"",0), &LB($x));

    &function_end($name);
}