#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Streamed
# GHASH performance was measured to be 6.67 cycles per processed byte
# on Itanium 2, which is >90% better than Microsoft compiler generated
# code. To anchor to something else sha1-ia64.pl module processes one
# byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per
# byte.

# September 2010
#
# It was originally thought that it makes lesser sense to implement
# "528B" variant on Itanium 2 for following reason. Because number of
# functional units is naturally limited, it appeared impossible to
# implement "528B" loop in 4 cycles, only in 5. This would mean that
# theoretically performance improvement couldn't be more than 20%.
# But occasionally you prove yourself wrong:-) I figured out a way to
# fold couple of instructions and having freed yet another instruction
# slot by unrolling the loop... Resulting performance is 4.45 cycles
# per processed byte and 50% better than "256B" version. On original
# Itanium performance should remain the same as the "256B" version,
# i.e. ~8.5 cycles.

# NOTE: "use strict" is intentionally not enabled.  Several heredoc
# comments below contain "$xi[i]", which Perl interpolates as an array
# element with a bareword index (evaluating to $xi[0]); that construct
# does not compile under "strict subs".  Variables are therefore kept as
# package globals, exactly as the generator has always used them.

# The last command-line argument, if present (and true), names the output
# file; STDOUT is redirected to it.  Remaining arguments are scanned below
# for compiler-style flags.
$output=pop and (open STDOUT,'>',$output or die "can't open $output: $!");

# $ADDP is the mnemonic used for every pointer adjustment in the emitted
# code.  On HP-UX it is "addp4" unless an argument contains "+DD64" or
# "-mlp64"; everywhere else it is plain "add".
if ($^O eq "hpux") {
    $ADDP="addp4";
    # Alternation, not a character class: only the two exact flags count.
    for (@ARGV) { $ADDP="add" if (/(?:\+DD|-mlp)64/); }
} else {
    $ADDP="add";
}

# Byte order: an explicit -DB_ENDIAN/-DL_ENDIAN argument wins (the last
# one seen, with -DL_ENDIAN checked second); otherwise the byte order of
# the machine running this script is used.
for (@ARGV) {
    $big_endian=1 if (/\-DB_ENDIAN/);
    $big_endian=0 if (/\-DL_ENDIAN/);
}
if (!defined($big_endian)) {
    $big_endian=(unpack('L',pack('N',1))==1);
}

# loop(LABEL, MASK_INP)
#
# Appends the software-pipelined "4-bit" multiplication loop to $code.
#   LABEL    - label emitted at the top of the loop and used as the
#              target of the closing br.ctop.
#   MASK_INP - when true, the two instructions that reference the input
#              stream (in[]/inp) are predicated on p63 instead of
#              p16/p17, i.e. references to inp are masked.
# Returns nothing useful; the only effect is on the global $code.
sub loop {
my $label=shift;
my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp

# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
# in scalable manner;-) Naturally assuming data in L1 cache...
# Special note about 'dep' instruction, which is used to construct
# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
# bytes boundary and lower 7 bits of its address are guaranteed to
# be zero.
$code.=<<___;
$label:
{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
	(p19)	dep	rem=Zlo,rem_4bitp,3,4	}
{ .mfi;	(p19)	xor	Zhi=Zhi,Hhi
	($p17)	xor	xi[1]=xi[1],in[1]	};;
{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
	(p19)	shrp	Zlo=Zhi,Zlo,4	}
{ .mfi;	(p19)	ld8	rem=[rem]
	(p18)	and	Hi[1]=mask0xf0,xi[2]	};;
{ .mmi;	($p16)	ld1	in[0]=[inp],-1
	(p18)	xor	Zlo=Zlo,Hlo
	(p19)	shr.u	Zhi=Zhi,4	}
{ .mib;	(p19)	xor	Hhi=Hhi,rem
	(p18)	add	Hi[1]=Htbl,Hi[1]	};;

{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
	(p18)	dep	rem=Zlo,rem_4bitp,3,4	}
{ .mfi;	(p17)	shladd	Hi[0]=xi[1],4,r0
	(p18)	xor	Zhi=Zhi,Hhi	};;
{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
	(p18)	shrp	Zlo=Zhi,Zlo,4	}
{ .mfi;	(p18)	ld8	rem=[rem]
	(p17)	and	Hi[0]=mask0xf0,Hi[0]	};;
{ .mmi;	(p16)	ld1	xi[0]=[Xi],-1
	(p18)	xor	Zlo=Zlo,Hlo
	(p18)	shr.u	Zhi=Zhi,4	}
{ .mib;	(p18)	xor	Hhi=Hhi,rem
	(p17)	add	Hi[0]=Htbl,Hi[0]
	br.ctop.sptk	$label	};;
___
}

######################################################################
# gcm_gmult_4bit: register aliases, prologue and pipeline priming.
#
$code=<<___;
.explicit
.text

prevfs=r2;	prevlc=r3;	prevpr=r8;
mask0xf0=r21;
rem=r22;	rem_4bitp=r23;
Xi=r24;	Htbl=r25;
inp=r26;	end=r27;
Hhi=r28;	Hlo=r29;
Zhi=r30;	Zlo=r31;

.align	128
.skip	16	// aligns loop body
.global	gcm_gmult_4bit#
.proc	gcm_gmult_4bit#
gcm_gmult_4bit:
	.prologue
{ .mmi;	.save	ar.pfs,prevfs
	alloc	prevfs=ar.pfs,2,6,0,8
	$ADDP	Xi=15,in0	// &Xi[15]
	mov	rem_4bitp=ip	}
{ .mii;	$ADDP	Htbl=8,in1	// &Htbl[0].lo
	.save	ar.lc,prevlc
	mov	prevlc=ar.lc
	.save	pr,prevpr
	mov	prevpr=pr	};;

	.body
	.rotr	in[3],xi[3],Hi[2]

{ .mib;	ld1	xi[2]=[Xi],-1	// Xi[15]
	mov	mask0xf0=0xf0
	brp.loop.imp	.Loop1,.Lend1-16};;
{ .mmi;	ld1	xi[1]=[Xi],-1	// Xi[14]
	};;
{ .mii;	shladd	Hi[1]=xi[2],4,r0
	mov	pr.rot=0x7<<16
	mov	ar.lc=13	};;
{ .mii;	and	Hi[1]=mask0xf0,Hi[1]
	mov	ar.ec=3
	xor	Zlo=Zlo,Zlo	};;
{ .mii;	add	Hi[1]=Htbl,Hi[1]	// &Htbl[nlo].lo
	add	rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
	xor	Zhi=Zhi,Zhi	};;
___
	# gcm_gmult_4bit has no input stream, hence the masked variant.
	loop(".Loop1",1);
$code.=<<___;
.Lend1:
{ .mib;	xor	Zhi=Zhi,Hhi	};;	// modulo-scheduling artefact
{ .mib;	mux1	Zlo=Zlo,\@rev	};;
{ .mib;	mux1	Zhi=Zhi,\@rev	};;
{ .mmi;	add	Hlo=9,Xi;;	// ;; is here to prevent
	add	Hhi=1,Xi	};;	// pipeline flush on Itanium
{ .mib;	st8	[Hlo]=Zlo
	mov	pr=prevpr,0x1ffff	};;
{ .mib;	st8	[Hhi]=Zhi
	mov	ar.lc=prevlc
	br.ret.sptk.many	b0	};;
.endp	gcm_gmult_4bit#
___

######################################################################
# "528B" (well, "512B" actually) streamed GHASH
#
$Xip="in0";
$Htbl="in1";
$inp="in2";
$len="in3";
$rem_8bit="loc0";
$mask0xff="loc1";
# On big-endian targets the user-mask switches become no-ops.
($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");

# load_htable(INSN, ...)
#
# Appends eight two-bundle groups to $code that load Htable[0..7] into
# r16..r31 and Htable[8..15] into f32..f47.  The second bundle of each
# group has one free slot: when exactly eight INSN strings are passed,
# one is consumed per iteration (in order) and placed in that slot.  The
# test ($i+$#_)==7 holds on every iteration only for eight arguments,
# because each shift lowers $#_ by one while $i rises by one.
sub load_htable {
    for (my $i=0;$i<8;$i++) {
	$code.=<<___;
{ .mmi;	ld8	r`16+2*$i+1`=[r8],16	// Htable[$i].hi
	ld8	r`16+2*$i`=[r9],16	}	// Htable[$i].lo
{ .mmi;	ldf8	f`32+2*$i+1`=[r10],16	// Htable[`8+$i`].hi
	ldf8	f`32+2*$i`=[r11],16	// Htable[`8+$i`].lo
___
	$code.=shift	if (($i+$#_)==7);
	$code.="\t};;\n"
    }
}

$code.=<<___;
prevsp=r3;

.align	32
.skip	16	// aligns loop body
.global	gcm_ghash_4bit#
.proc	gcm_ghash_4bit#
gcm_ghash_4bit:
	.prologue
{ .mmi;	.save	ar.pfs,prevfs
	alloc	prevfs=ar.pfs,4,2,0,0
	.vframe	prevsp
	mov	prevsp=sp
	mov	$rem_8bit=ip	};;
	.body
{ .mfi;	$ADDP	r8=0+0,$Htbl
	$ADDP	r9=0+8,$Htbl	}
{ .mfi;	$ADDP	r10=128+0,$Htbl
	$ADDP	r11=128+8,$Htbl	};;
___
	load_htable(
	"\t$ADDP\t$Xip=15,$Xip",	# &Xi[15]
	"\t$ADDP\t$len=$len,$inp",	# &inp[len]
	"\t$ADDP\t$inp=15,$inp",	# &inp[15]
	"\tmov\t$mask0xff=0xff",
	"\tadd\tsp=-512,sp",
	"\tandcm\tsp=sp,$mask0xff",	# align stack frame
	"\tadd\tr14=0,sp",
	"\tadd\tr15=8,sp");
$code.=<<___;
{ .mmi;	$sum	1<<1	// go big-endian
	add	r8=256+0,sp
	add	r9=256+8,sp	}
{ .mmi;	add	r10=256+128+0,sp
	add	r11=256+128+8,sp
	add	$len=-17,$len	};;
___
# $i is deliberately a package global in the two loops below: the heredoc
# that follows the second loop reuses its final value (6).
for($i=0;$i<8;$i++) {	# generate first half of Hshr4[]
my ($rlo,$rhi)=("r".(16+2*$i),"r".(16+2*$i+1));
$code.=<<___;
{ .mmi;	st8	[r8]=$rlo,16	// Htable[$i].lo
	st8	[r9]=$rhi,16	// Htable[$i].hi
	shrp	$rlo=$rhi,$rlo,4	}//;;
{ .mmi;	stf8	[r10]=f`32+2*$i`,16	// Htable[`8+$i`].lo
	stf8	[r11]=f`32+2*$i+1`,16	// Htable[`8+$i`].hi
	shr.u	$rhi=$rhi,4	};;
{ .mmi;	st8	[r14]=$rlo,16	// Htable[$i].lo>>4
	st8	[r15]=$rhi,16	}//;;	// Htable[$i].hi>>4
___
}
$code.=<<___;
{ .mmi;	ld8	r16=[r8],16	// Htable[8].lo
	ld8	r17=[r9],16	};;	// Htable[8].hi
{ .mmi;	ld8	r18=[r8],16	// Htable[9].lo
	ld8	r19=[r9],16	}	// Htable[9].hi
{ .mmi;	rum	1<<5	// clear um.mfh
	shrp	r16=r17,r16,4	};;
___
for($i=0;$i<6;$i++) {	# generate second half of Hshr4[]
$code.=<<___;
{ .mmi;	ld8	r`20+2*$i`=[r8],16	// Htable[`10+$i`].lo
	ld8	r`20+2*$i+1`=[r9],16	// Htable[`10+$i`].hi
	shr.u	r`16+2*$i+1`=r`16+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`16+2*$i`,16	// Htable[`8+$i`].lo>>4
	st8	[r15]=r`16+2*$i+1`,16	// Htable[`8+$i`].hi>>4
	shrp	r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4	}
___
}
# Here $i==6: the remaining two table entries are finished without
# further loads.
$code.=<<___;
{ .mmi;	shr.u	r`16+2*$i+1`=r`16+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`16+2*$i`,16	// Htable[`8+$i`].lo>>4
	st8	[r15]=r`16+2*$i+1`,16	// Htable[`8+$i`].hi>>4
	shrp	r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4	}
{ .mmi;	add	$Htbl=256,sp	// &Htable[0]
	add	$rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
	shr.u	r`18+2*$i+1`=r`18+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`18+2*$i`	// Htable[`8+$i`].lo>>4
	st8	[r15]=r`18+2*$i+1`	}	// Htable[`8+$i`].hi>>4
___

# Register assignment for the unrolled main loop.  @xi and @rem are
# rotated by hand after every fragment to mimic register rotation.
$in="r15";
@xi=("r16","r17");
@rem=("r18","r19");
($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
($Atbl,$Btbl)=("r26","r27");

$code.=<<___;	# (p16)
{ .mmi;	ld1	$in=[$inp],-1	//(p16) *inp--
	ld1	$xi[0]=[$Xip],-1	//(p16) *Xi--
	cmp.eq	p0,p6=r0,r0	};;	// clear p6
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

$code.=<<___;	# (p16),(p17)
{ .mmi;	ld1	$xi[0]=[$Xip],-1	//(p16) *Xi--
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mii;	ld1	$in=[$inp],-1	//(p16) *inp--
	dep	$Atbl=$xi[1],$Htbl,4,4	//(p17) &Htable[nlo].lo
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
.align	32
.LOOP:
{ .mmi;
(p6)	st8	[$Xip]=$Zhi,13
	xor	$Zlo=$Zlo,$Zlo
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi].lo
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

$code.=<<___;	# (p16),(p17),(p18)
{ .mmi;	ld8	$Alo=[$Atbl],8	//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256	//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mfi;	ld8	$Ahi=[$Atbl]	//(p18) Htable[nlo].hi
	dep	$Atbl=$xi[1],$Htbl,4,4	}	//(p17) &Htable[nlo].lo
{ .mfi;	shladd	$rem[0]=$rem[0],4,r0	//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo	};;	//(p18) Z.lo^=Htable[nlo].lo
{ .mmi;	ld8	$Blo=[$Btbl],8	//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	ld1	$in=[$inp],-1	}	//(p16) *inp--
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo	//(p18) Z.lo^(Htable[nhi].lo<<4)
	mov	$Zhi=$Ahi	//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]	//(p18) Hshr4[nhi].hi
	ld1	$xi[0]=[$Xip],-1	//(p16) *Xi--
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

for ($i=1;$i<14;$i++) {
# Above and below fragments are derived from this one by removing
# unsuitable (p??) instructions.
$code.=<<___;	# (p16),(p17),(p18),(p19)
{ .mmi;	ld8	$Alo=[$Atbl],8	//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256	//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	shr.u	$Zhi=$Zhi,8	}	//(p19) Z.hi>>=8
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo	//(p19) Z.lo^=Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mmi;	ld8	$Ahi=[$Atbl]	//(p18) Htable[nlo].hi
	ld2	$rem[1]=[$rem[1]]	//(p19) rem_8bit[rem]
	dep	$Atbl=$xi[1],$Htbl,4,4	}	//(p17) &Htable[nlo].lo
{ .mmi;	shladd	$rem[0]=$rem[0],4,r0	//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo	//(p18) Z.lo^=Htable[nlo].lo
	xor	$Zhi=$Zhi,$Bhi	};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi;	ld8	$Blo=[$Btbl],8	//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	ld1	$in=[$inp],-1	//(p16) *inp--
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo	//(p18) Z.lo^(Htable[nhi].lo<<4)
	xor	$Zhi=$Zhi,$Ahi	//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]	//(p18) Hshr4[nhi].hi
	ld1	$xi[0]=[$Xip],-1	//(p16) *Xi--
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]	//(p19) Z.hi^=rem_8bit[rem]<<48
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
}

$code.=<<___;	# (p17),(p18),(p19)
{ .mmi;	ld8	$Alo=[$Atbl],8	//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256	//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	shr.u	$Zhi=$Zhi,8	}	//(p19) Z.hi>>=8
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo	//(p19) Z.lo^=Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mmi;	ld8	$Ahi=[$Atbl]	//(p18) Htable[nlo].hi
	ld2	$rem[1]=[$rem[1]]	//(p19) rem_8bit[rem]
	dep	$Atbl=$xi[1],$Htbl,4,4	};;	//(p17) &Htable[nlo].lo
{ .mmi;	shladd	$rem[0]=$rem[0],4,r0	//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo	//(p18) Z.lo^=Htable[nlo].lo
	xor	$Zhi=$Zhi,$Bhi	};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi;	ld8	$Blo=[$Btbl],8	//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo	//(p18) Z.lo^(Htable[nhi].lo<<4)
	xor	$Zhi=$Zhi,$Ahi	//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]	//(p18) Hshr4[nhi].hi
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]	//(p19) Z.hi^=rem_8bit[rem]<<48
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

$code.=<<___;	# (p18),(p19)
{ .mfi;	ld8	$Alo=[$Atbl],8	//(p18) Htable[nlo].lo,&Htable[nlo].hi
	shr.u	$Zhi=$Zhi,8	}	//(p19) Z.hi>>=8
{ .mfi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo	};;	//(p19) Z.lo^=Hshr4[nhi].lo
{ .mfi;	ld8	$Ahi=[$Atbl]	//(p18) Htable[nlo].hi
	xor	$Zlo=$Zlo,$Alo	}	//(p18) Z.lo^=Htable[nlo].lo
{ .mfi;	ld2	$rem[1]=[$rem[1]]	//(p19) rem_8bit[rem]
	xor	$Zhi=$Zhi,$Bhi	};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mfi;	ld8	$Blo=[$Btbl],8	//(p18) Htable[nhi].lo,&Htable[nhi].hi
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mfi;	shladd	$rem[0]=$Zlo,4,r0	//(p18) Z.lo<<4
	xor	$Zhi=$Zhi,$Ahi	};;	//(p18) Z.hi^=Htable[nlo].hi
{ .mfi;	ld8	$Bhi=[$Btbl]	//(p18) Htable[nhi].hi
	shrp	$Zlo=$Zhi,$Zlo,4	}	//(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
{ .mfi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]	};;	//(p19) Z.hi^=rem_8bit[rem]<<48
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

$code.=<<___;	# (p19)
{ .mmi;	cmp.ltu	p6,p0=$inp,$len
	add	$inp=32,$inp
	shr.u	$Zhi=$Zhi,4	}	//(p19) Z.hi>>=4
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo	//(p19) Z.lo^=Hshr4[nhi].lo
	add	$Xip=9,$Xip	};;	// &Xi.lo
{ .mmi;	ld2	$rem[1]=[$rem[1]]	//(p19) rem_8bit[rem]
(p6)	ld1	$in=[$inp],-1	//[p16] *inp--
(p6)	extr.u	$xi[1]=$Zlo,8,8	}	//[p17] Xi[14]
{ .mmi;	xor	$Zhi=$Zhi,$Bhi	//(p19) Z.hi^=Hshr4[nhi].hi
(p6)	and	$xi[0]=$Zlo,$mask0xff	};;	//[p16] Xi[15]
{ .mmi;	st8	[$Xip]=$Zlo,-8
(p6)	xor	$xi[0]=$xi[0],$in	//[p17] xi=$xi[i]^inp[i]
	shl	$rem[1]=$rem[1],48	};;	//(p19) rem_8bit[rem]<<48
{ .mmi;
(p6)	ld1	$in=[$inp],-1	//[p16] *inp--
	xor	$Zhi=$Zhi,$rem[1]	//(p19) Z.hi^=rem_8bit[rem]<<48
(p6)	dep	$Atbl=$xi[0],$Htbl,4,4	}	//[p17] &Htable[nlo].lo
{ .mib;
(p6)	and	$xi[0]=-16,$xi[0]	//[p17] nhi=xi&0xf0
(p6)	br.cond.dptk.many	.LOOP	};;

{ .mib;	st8	[$Xip]=$Zhi	};;
{ .mib;	$rum	1<<1	// return to little-endian
	.restore	sp
	mov	sp=prevsp
	br.ret.sptk.many	b0	};;
.endp	gcm_ghash_4bit#
___

# Shared reduction tables.  rem_4bit must stay 128-byte aligned: the
# 'dep' in the multiplication loop relies on the low 7 address bits
# being zero.
$code.=<<___;
.align	128
.type	rem_4bit#,\@object
rem_4bit:
	data8	0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
	data8	0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
	data8	0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
	data8	0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
.size	rem_4bit#,128
.type	rem_8bit#,\@object
rem_8bit:
	data1	0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
	data1	0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
	data1	0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
	data1	0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
	data1	0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
	data1	0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
	data1	0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
	data1	0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
	data1	0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
	data1	0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
	data1	0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
	data1	0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
	data1	0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
	data1	0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
	data1	0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
	data1	0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
	data1	0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
	data1	0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
	data1	0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
	data1	0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
	data1	0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
	data1	0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
	data1	0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
	data1	0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
	data1	0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
	data1	0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
	data1	0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
	data1	0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
	data1	0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
	data1	0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
	data1	0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
	data1	0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
.size	rem_8bit#,512
stringz	"GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___

# Post-processing: on big-endian targets the byte-reversing mux1 is
# replaced by a no-op, then every `expr` placeholder is evaluated.
$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm	if ($big_endian);
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;
close STDOUT or die "error closing STDOUT: $!";