#!/usr/bin/env perl

# Find the directory this script was started from so that perlasm/ can be
# located next to it.  The capture is only consulted when the match
# succeeded; with no directory component in $0 the prefix is empty and the
# plain "perlasm" entry is what ends up being searched.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
push(@INC, "${dir}perlasm", "perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],"crypto/cpu-x86-asm");

# Remember whether the build was configured with -DOPENSSL_IA32_SSE2.
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

# OPENSSL_ia32_cpuid(out)
#
# Queries CPUID and returns the leaf-1 feature words, adjusted as noted
# inline:
#   eax    <- leaf-1 EDX (carried in esi)
#   edx    <- leaf-1 ECX merged with the AMD XOP flag (carried in ebp)
#   8(out) <- zeroed first; receives leaf-7 EBX when the "intel" path runs
#             and the maximum standard leaf is at least 7
# When bit 21 of EFLAGS cannot be toggled, CPUID is treated as absent and
# the function returns eax = edx = 0 without writing through out.
&function_begin("OPENSSL_ia32_cpuid");
	# Try to flip EFLAGS bit 21; ecx ends up holding the bits that changed.
	&xor	("edx","edx");
	&pushf	();
	&pop	("eax");
	&mov	("ecx","eax");
	&xor	("eax",1<<21);
	&push	("eax");
	&popf	();
	&pushf	();
	&pop	("eax");
	&xor	("ecx","eax");
	&xor	("eax","eax");
	&bt	("ecx",21);
	&jnc	(&label("nocpuid"));
	&mov	("esi",&wparam(0));
	&mov	(&DWP(8,"esi"),"eax");	# clear 3rd word
	&cpuid	();			# leaf 0 (eax was zeroed above)
	&mov	("edi","eax");		# max value for standard query level

	# ebp accumulates "vendor string mismatch" flags for GenuineIntel.
	&xor	("eax","eax");
	&cmp	("ebx",0x756e6547);	# "Genu"
	&setne	(&LB("eax"));
	&mov	("ebp","eax");
	&cmp	("edx",0x49656e69);	# "ineI"
	&setne	(&LB("eax"));
	&or	("ebp","eax");
	&cmp	("ecx",0x6c65746e);	# "ntel"
	&setne	(&LB("eax"));
	&or	("ebp","eax");		# 0 indicates Intel CPU
	&jz	(&label("intel"));

	# Same scheme for AuthenticAMD, accumulated in esi.
	&cmp	("ebx",0x68747541);	# "Auth"
	&setne	(&LB("eax"));
	&mov	("esi","eax");
	&cmp	("edx",0x69746E65);	# "enti"
	&setne	(&LB("eax"));
	&or	("esi","eax");
	&cmp	("ecx",0x444D4163);	# "cAMD"
	&setne	(&LB("eax"));
	&or	("esi","eax");		# 0 indicates AMD CPU
	&jnz	(&label("intel"));	# neither vendor: use the common path

	# AMD specific
	&mov	("eax",0x80000000);
	&cpuid	();
	&cmp	("eax",0x80000001);
	&jb	(&label("intel"));
	&mov	("esi","eax");		# max value for extended query level
	&mov	("eax",0x80000001);
	&cpuid	();
	&or	("ebp","ecx");
	&and	("ebp",1<<11|1);	# keep XOP bit (#11) and bit #0
	&cmp	("esi",0x80000008);
	&jb	(&label("intel"));

	&mov	("eax",0x80000008);
	&cpuid	();
	&movz	("esi",&LB("ecx"));	# number of cores - 1
	&inc	("esi");		# number of cores

	&mov	("eax",1);
	&xor	("ecx","ecx");
	&cpuid	();
	&bt	("edx",28);
	&jnc	(&label("generic"));
	&shr	("ebx",16);
	&and	("ebx",0xff);
	&cmp	("ebx","esi");
	&ja	(&label("generic"));
	&and	("edx",0xefffffff);	# clear hyper-threading bit
	&jmp	(&label("generic"));

&set_label("intel");
	&cmp	("edi",7);
	&jb	(&label("cacheinfo"));

	&mov	("esi",&wparam(0));	# reload: esi was reused above
	&mov	("eax",7);
	&xor	("ecx","ecx");
	&cpuid	();
	&mov	(&DWP(8,"esi"),"ebx");	# store leaf-7 EBX as the 3rd word

&set_label("cacheinfo");
	&cmp	("edi",4);
	&mov	("edi",-1);		# mov leaves the flags from cmp intact
	&jb	(&label("nocacheinfo"));

	&mov	("eax",4);
	&mov	("ecx",0);		# query L1D
	&cpuid	();
	&mov	("edi","eax");
	&shr	("edi",14);
	&and	("edi",0xfff);		# number of cores -1 per L1D

&set_label("nocacheinfo");
	&mov	("eax",1);
	&xor	("ecx","ecx");
	&cpuid	();
	&and	("edx",0xbfefffff);	# force reserved bits #20, #30 to 0
	&cmp	("ebp",0);
	&jne	(&label("notintel"));
	&or	("edx",1<<30);		# set reserved bit#30 on Intel CPUs
&set_label("notintel");
	&bt	("edx",28);		# test hyper-threading bit
	&jnc	(&label("generic"));
	&and	("edx",0xefffffff);
	&cmp	("edi",0);
	&je	(&label("generic"));

	&or	("edx",0x10000000);
	&shr	("ebx",16);
	&cmp	(&LB("ebx"),1);
	&ja	(&label("generic"));
	&and	("edx",0xefffffff);	# clear hyper-threading bit if not

&set_label("generic");
	&and	("ebp",1<<11);		# isolate AMD XOP flag
	&and	("ecx",0xfffff7ff);	# force 11th bit to 0
	&mov	("esi","edx");
	&or	("ebp","ecx");		# merge AMD XOP flag

	&bt	("ecx",27);		# check OSXSAVE bit
	&jnc	(&label("clear_avx"));
	&xor	("ecx","ecx");
	&data_byte(0x0f,0x01,0xd0);	# xgetbv
	&and	("eax",6);
	&cmp	("eax",6);		# both bits #1 and #2 set: keep everything
	&je	(&label("done"));
	&cmp	("eax",2);		# only bit #1 set: drop AVX-class bits
	&je	(&label("clear_avx"));
&set_label("clear_xmm");
	&and	("ebp",0xfdfffffd);	# clear AESNI and PCLMULQDQ bits
	&and	("esi",0xfeffffff);	# clear FXSR
&set_label("clear_avx");
	&and	("ebp",0xefffe7ff);	# clear AVX, FMA and AMD XOP bits
	&mov	("edi",&wparam(0));
	&and	(&DWP(8,"edi"),0xffffffdf);	# clear AVX2
&set_label("done");
	&mov	("eax","esi");
	&mov	("edx","ebp");
&set_label("nocpuid");
&function_end("OPENSSL_ia32_cpuid");

&external_label("OPENSSL_ia32cap_P");

# OPENSSL_rdtsc()
#
# Returns the RDTSC result in edx:eax, or 0 when bit #4 of the first
# OPENSSL_ia32cap_P word is clear.
&function_begin_B("OPENSSL_rdtsc","EXTRN\t_OPENSSL_ia32cap_P:DWORD");
	&xor	("eax","eax");
	&xor	("edx","edx");
	&picmeup("ecx","OPENSSL_ia32cap_P");
	&bt	(&DWP(0,"ecx"),4);
	&jnc	(&label("notsc"));
	&rdtsc	();
&set_label("notsc");
	&ret	();
&function_end_B("OPENSSL_rdtsc");

# This works in Ring 0 only [read DJGPP+MS-DOS+privileged DPMI host],
# but it's safe to call it on any [supported] 32-bit platform...
# Just check for [non-]zero return value...
#
# The result in edx:eax is the difference between the RDTSC readings
# taken before and after the halt instruction.  Zero is returned instead
# when any of the three checks below fails.
&function_begin_B("OPENSSL_instrument_halt","EXTRN\t_OPENSSL_ia32cap_P:DWORD");
	&picmeup("ecx","OPENSSL_ia32cap_P");
	&bt	(&DWP(0,"ecx"),4);
	&jnc	(&label("nohalt"));	# no TSC

	&data_word(0x9058900e);		# push %cs; pop %eax
	&and	("eax",3);		# low two bits of the %cs selector
	&jnz	(&label("nohalt"));	# not enough privileges

	&pushf	();
	&pop	("eax");
	&bt	("eax",9);
	&jnc	(&label("nohalt"));	# interrupts are disabled

	&rdtsc	();
	&push	("edx");		# keep the first reading on the stack
	&push	("eax");
	&halt	();
	&rdtsc	();

	&sub	("eax",&DWP(0,"esp"));	# 64-bit subtract: second - first
	&sbb	("edx",&DWP(4,"esp"));
	&add	("esp",8);		# drop the saved reading
	&ret	();

&set_label("nohalt");
	&xor	("eax","eax");
	&xor	("edx","edx");
	&ret	();
&function_end_B("OPENSSL_instrument_halt");

# Essentially there is only one use for this function. Under DJGPP:
#
#	#include <go32.h>
#	...
#	i=OPENSSL_far_spin(_dos_ds,0x46c);
#	...
# to obtain the number of spins till closest timer interrupt.

# OPENSSL_far_spin(selector, offset)
#
# Loads the first argument into %ds, then counts in eax how many times the
# loop runs before the dword at offset (second argument) changes.  Returns
# 0 when bit #9 of EFLAGS shows that interrupts are disabled.
&function_begin_B("OPENSSL_far_spin");
	&pushf	();
	&pop	("eax");
	&bt	("eax",9);
	&jnc	(&label("nospin"));	# interrupts are disabled

	# Both arguments are fetched before %ds is pushed, so the offsets
	# are still relative to the entry stack pointer.
	&mov	("eax",&DWP(4,"esp"));
	&mov	("ecx",&DWP(8,"esp"));
	&data_word (0x90d88e1e);	# push %ds, mov %eax,%ds
	&xor	("eax","eax");
	&mov	("edx",&DWP(0,"ecx"));	# reference value to compare against
	&jmp	(&label("spin"));

	&align	(16);
&set_label("spin");
	&inc	("eax");
	&cmp	("edx",&DWP(0,"ecx"));
	&je	(&label("spin"));

	&data_word (0x1f909090);	# pop %ds
	&ret	();

&set_label("nospin");
	&xor	("eax","eax");
	&xor	("edx","edx");
	&ret	();
&function_end_B("OPENSSL_far_spin");

# OPENSSL_wipe_cpu()
#
# Zeroes eax and edx, clears xmm0-xmm7 when built with SSE2 support and the
# capability word reports both SSE2 and FXSR, resets the x87 register bank,
# and returns esp+4 in eax.
&function_begin_B("OPENSSL_wipe_cpu","EXTRN\t_OPENSSL_ia32cap_P:DWORD");
	&xor	("eax","eax");
	&xor	("edx","edx");
	&picmeup("ecx","OPENSSL_ia32cap_P");
	&mov	("ecx",&DWP(0,"ecx"));	# ecx = first capability word
	# Test the loaded word itself.  Using a memory operand here would
	# treat the capability bits as an address and read from it.
	# NOTE(review): bit #1 is kept from the original; x87 is reported in
	# bit #0 of CPUID.1:EDX -- confirm which bit is intended.
	&bt	("ecx",1);
	&jnc	(&label("no_x87"));
	if ($sse2) {
		&and	("ecx",1<<26|1<<24);	# check SSE2 and FXSR bits
		&cmp	("ecx",1<<26|1<<24);
		&jne	(&label("no_sse2"));
		&pxor	("xmm0","xmm0");
		&pxor	("xmm1","xmm1");
		&pxor	("xmm2","xmm2");
		&pxor	("xmm3","xmm3");
		&pxor	("xmm4","xmm4");
		&pxor	("xmm5","xmm5");
		&pxor	("xmm6","xmm6");
		&pxor	("xmm7","xmm7");
	&set_label("no_sse2");
	}
	# just a bunch of fldz to zap the fp/mm bank followed by finit...
	&data_word(0xeed9eed9,0xeed9eed9,0xeed9eed9,0xeed9eed9,0x90e3db9b);
&set_label("no_x87");
	&lea	("eax",&DWP(4,"esp"));
	&ret	();
&function_end_B("OPENSSL_wipe_cpu");

# OPENSSL_atomic_add(ptr, increment)
#
# Adds increment to the dword at ptr with a lock cmpxchg retry loop and
# returns the new value in eax.
&function_begin_B("OPENSSL_atomic_add");
	&mov	("edx",&DWP(4,"esp"));	# fetch the pointer, 1st arg
	&mov	("ecx",&DWP(8,"esp"));	# fetch the increment, 2nd arg
	&push	("ebx");
	&nop	();
	&mov	("eax",&DWP(0,"edx"));
&set_label("spin");
	&lea	("ebx",&DWP(0,"eax","ecx"));	# candidate value: old + increment
	&nop	();
	&data_word(0x1ab10ff0);	# lock;	cmpxchg	%ebx,(%edx)	# %eax is involved and is always reloaded
	&jne	(&label("spin"));
	&mov	("eax","ebx");	# OpenSSL expects the new value
	&pop	("ebx");
	&ret	();
&function_end_B("OPENSSL_atomic_add");

# This function can become handy under Win32 in situations when
# we don't know which calling convention, __stdcall or __cdecl(*),
# indirect callee is using. In C it can be deployed as
#
#ifdef OPENSSL_CPUID_OBJ
#	type OPENSSL_indirect_call(void *f,...);
#	...
#	OPENSSL_indirect_call(func,[up to $max arguments]);
#endif
#
# (*)	it's designed to work even for __fastcall if number of
#	arguments is 1 or 2!
&function_begin_B("OPENSSL_indirect_call");
	{
	my $max = 7;			# $max has to be chosen as 4*n-1
					# in order to preserve eventual
					# stack alignment
	&push	("ebp");
	&mov	("ebp","esp");
	&sub	("esp",$max*4);
	# The first two arguments are copied through ecx and edx, which is
	# what makes the __fastcall case mentioned above work.
	&mov	("ecx",&DWP(12,"ebp"));
	&mov	(&DWP(0,"esp"),"ecx");
	&mov	("edx",&DWP(16,"ebp"));
	&mov	(&DWP(4,"esp"),"edx");
	for my $i (2 .. $max-1)
	    {
	    # Some copies will be redundant/bogus...
	    &mov	("eax",&DWP(12+$i*4,"ebp"));
	    &mov	(&DWP(0+$i*4,"esp"),"eax");
	    }
	&call_ptr	(&DWP(8,"ebp"));# make the call...
	&mov	("esp","ebp");	# ... and just restore the stack pointer
				# without paying attention to what we called,
				# (__cdecl *func) or (__stdcall *one).
	&pop	("ebp");
	&ret	();
	}
&function_end_B("OPENSSL_indirect_call");

# OPENSSL_ia32_rdrand()
#
# Issues rdrand up to 8 times, stopping at the first attempt that sets the
# carry flag, and returns the value in eax.  A zero eax is replaced by ecx,
# the remaining retry count, which is still non-zero after a successful
# attempt and zero once all attempts have been used up.
&function_begin_B("OPENSSL_ia32_rdrand");
	&mov	("ecx",8);
&set_label("loop");
	&rdrand	("eax");
	&jc	(&label("break"));
	&loop	(&label("loop"));
&set_label("break");
	&cmp	("eax",0);
	&cmove	("eax","ecx");
	&ret	();
&function_end_B("OPENSSL_ia32_rdrand");

&hidden("OPENSSL_ia32cap_P");

&asm_finish();
