#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Even though
# loops are aggressively modulo-scheduled in respect to references to
# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
# scheduling "glitch," because uprofile(1) indicates uniform sample
# distribution, as if all instruction bundles execute in 1.5 cycles.
# Meaning that it could have been even faster, yet 12 cycles is ~60%
# better than gcc-generated code and ~80% better than code generated
# by vendor compiler.
#
# Usage: perl <this script> [output-file]
# The generated assembly is written to output-file when one is given,
# otherwise to standard output.

use strict;
use warnings;

# Symbolic register names interpolated into the assembly templates below.
# The trailing "$n" remarks give the hardware register number.
my $cnt  = "v0";	# $0
my $t0   = "t0";
my $t1   = "t1";
my $t2   = "t2";
my $Thi0 = "t3";	# $4
my $Tlo0 = "t4";
my $Thi1 = "t5";
my $Tlo1 = "t6";
my $rem  = "t7";	# $8
#################
my $Xi   = "a0";	# $16, input argument block
my $Htbl = "a1";
my $inp  = "a2";
my $len  = "a3";
my $nlo  = "a4";	# $20
my $nhi  = "a5";
my $Zhi  = "t8";
my $Zlo  = "t9";
my $Xhi  = "t10";	# $24
my $Xlo  = "t11";
my $remp = "t12";
# AT is used as an ordinary register holding the rem_4bit table address;
# the generated code therefore starts with ".set noat".
my $rem_4bit = "AT";	# $28

# Accumulates the generated assembly text. Declared ahead of loop() so
# that the sub closes over this lexical.
my $code;

{
    # Number of times loop() has been expanded; it is interpolated into
    # the .Looplo/.Loophi labels so every expansion gets unique labels.
    my $N = 0;

    # loop()
    #
    # Appends one copy of the multiplication sequence to $code. The
    # emitted code consumes the bytes of $Xlo (indices 7 down to 0) and
    # then of $Xhi (7 down to 0), splitting each byte into two nibbles
    # that index 16-byte entries of $Htbl, and folds the low nibble of
    # Z through 8-byte entries of the rem_4bit table. The result is left
    # in $Zhi:$Zlo. Takes no arguments; callers use it in void context.
    sub loop {
        $N++;
        $code .= <<___;
.align	4
	extbl	$Xlo,7,$nlo
	and	$nlo,0xf0,$nhi
	sll	$nlo,4,$nlo
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Zlo,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Zhi,0($nlo)

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,6(zero)
	extbl	$Xlo,6,$nlo

	ldq	$Tlo1,8($nhi)
	s8addq	$remp,$rem_4bit,$remp
	ldq	$Thi1,0($nhi)
	srl	$Zlo,4,$Zlo

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	and	$nlo,0xf0,$nhi

	xor	$Tlo1,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	xor	$Thi1,$Zhi,$Zhi
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Tlo0,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Thi0,0($nlo)

.Looplo$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xlo,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Looplo$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,7(zero)
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	unop


.Loophi$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Loophi$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo0,$Zlo,$Zlo
	xor	$Thi0,$Zhi,$Zhi

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	xor	$t0,$Zlo,$Zlo
	xor	$rem,$Zhi,$Zhi
___
        return;
    }
}

# gcm_gmult_4bit prologue: load the 16-byte block at $Xi into $Xhi:$Xlo
# and compute the address of rem_4bit relative to .Lpic1.
$code = <<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

	.text

	.set	noat
	.set	noreorder
.globl	gcm_gmult_4bit
.align	4
.ent	gcm_gmult_4bit
gcm_gmult_4bit:
	.frame	sp,0,ra
	.prologue 0

	ldq	$Xlo,8($Xi)
	ldq	$Xhi,0($Xi)

	br	$rem_4bit,.Lpic1
.Lpic1:	lda	$rem_4bit,rem_4bit-.Lpic1($rem_4bit)
___

loop();

# gcm_gmult_4bit epilogue: byte-swap $Zhi:$Zlo and store the result back
# through $Xi.
$code .= <<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	ret	(ra)
.end	gcm_gmult_4bit
___

# Extra registers used only by gcm_ghash_4bit; the generated prologue
# saves s0/s1 on the stack and the epilogue reloads them.
my $inhi = "s0";
my $inlo = "s1";

# gcm_ghash_4bit prologue and top of the per-block loop: each .Louter
# pass assembles 16 input bytes from $inp with ldq_u/extql/extqh, XORs
# them into $Xhi:$Xlo and subtracts 16 from $len.
$code .= <<___;
.globl	gcm_ghash_4bit
.align	4
.ent	gcm_ghash_4bit
gcm_ghash_4bit:
	lda	sp,-32(sp)
	stq	ra,0(sp)
	stq	s0,8(sp)
	stq	s1,16(sp)
	.mask	0x04000600,-32
	.frame	sp,32,ra
	.prologue 0

	ldq_u	$inhi,0($inp)
	ldq_u	$Thi0,7($inp)
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)
	ldq	$Xhi,0($Xi)
	ldq	$Xlo,8($Xi)

	br	$rem_4bit,.Lpic2
.Lpic2:	lda	$rem_4bit,rem_4bit-.Lpic2($rem_4bit)

.Louter:
	extql	$inhi,$inp,$inhi
	extqh	$Thi0,$inp,$Thi0
	or	$inhi,$Thi0,$inhi
	lda	$inp,16($inp)

	extql	$inlo,$inp,$inlo
	extqh	$Tlo0,$inp,$Tlo0
	or	$inlo,$Tlo0,$inlo
	subq	$len,16,$len

	xor	$Xlo,$inlo,$Xlo
	xor	$Xhi,$inhi,$Xhi
___

loop();

# gcm_ghash_4bit tail: the byte swap is interleaved with the loads for
# the next block; the loop is left only when $len has reached exactly
# zero (beq), after which the result is stored through $Xi. ra is saved
# in the prologue, but its reload is emitted commented out. The shared
# rem_4bit table follows the function.
$code .= <<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo
	beq	$len,.Ldone

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo
	ldq_u	$inhi,0($inp)

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2
	ldq_u	$Thi0,7($inp)

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	br	zero,.Louter

.Ldone:
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi

	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	.set	noreorder
	/*ldq	ra,0(sp)*/
	ldq	s0,8(sp)
	ldq	s1,16(sp)
	lda	sp,32(sp)
	ret	(ra)
.end	gcm_ghash_4bit

.align	4
rem_4bit:
	.quad	0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
	.quad	0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
	.quad	0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
	.quad	0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
.ascii	"GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	4

___

# Emit the result. An optional first command-line argument names the
# output file; without it the assembly goes to the inherited STDOUT.
# Every step is checked so that a failed open or a short write (e.g. a
# full disk) aborts the build instead of leaving a truncated or missing
# file behind.
my $output = shift;
if ($output) {
    open STDOUT, '>', $output
        or die "$0: cannot open $output for writing: $!\n";
}
print STDOUT $code
    or die "$0: error writing output: $!\n";
close STDOUT
    or die "$0: error closing output: $!\n";