#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
#
# June 2014
#
# Initial version was developed in tight cooperation with Ard
# Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from
# other assembly modules. Just like aesv8-armx.pl this module
# supports both AArch32 and AArch64 execution modes.
#
# July 2014
#
# Implement 2x aggregated reduction [see ghash-x86.pl for background
# information].
#
# Current performance in cycles per processed byte:
#
#		PMULL[2]	32-bit NEON(*)
# Apple A7	0.92		5.62
# Cortex-A53	1.01		8.39
# Cortex-A57	1.17		7.61
# Denver	0.71		6.02
#
# (*)	presented for reference/comparison purposes;

$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$Xi="x0";	# argument block
$Htbl="x1";
$inp="x2";
$len="x3";

$inc="x12";

{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));

$code=<<___;
#include <openssl/arm_arch.h>

.text
___
$code.=<<___ if ($flavour =~ /64/);
#if !defined(__clang__)
.arch	armv8-a+crypto
#endif
___
$code.=".fpu	neon\n.code	32\n" if ($flavour !~ /64/);

################################################################################
# void gcm_init_v8(u128 Htable[16],const u64 H[2]);
#
# input:	128-bit H - secret parameter E(K,0^128)
# output:	precomputed table filled with degrees of twisted H;
#		H is twisted to handle reverse bitness of GHASH;
#		only a few of the 16 slots of Htable[16] are used;
#		data is opaque to the outside world (which allows the
#		code to be optimized independently);
#
$code.=<<___;
.global	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
	vld1.64		{$t1},[x1]		@ load input H
	vmov.i8		$xC2,#0xe1
	vshl.i64	$xC2,$xC2,#57		@ 0xc2.0
	vext.8		$IN,$t1,$t1,#8
	vshr.u64	$t2,$xC2,#63
	vdup.32		$t1,${t1}[1]
	vext.8		$t0,$t2,$xC2,#8		@ t0=0xc2....01
	vshr.u64	$t2,$IN,#63
	vshr.s32	$t1,$t1,#31		@ broadcast carry bit
	vand		$t2,$t2,$t0
	vshl.i64	$IN,$IN,#1
	vext.8		$t2,$t2,$t2,#8
	vand		$t0,$t0,$t1
	vorr		$IN,$IN,$t2		@ H<<<=1
	veor		$H,$IN,$t0		@ twisted H
	vst1.64		{$H},[x0],#16		@ store Htable[0]

	@ calculate H^2
	vext.8		$t0,$H,$H,#8		@ Karatsuba pre-processing
	vpmull.p64	$Xl,$H,$H
	veor		$t0,$t0,$H
	vpmull2.p64	$Xh,$H,$H
	vpmull.p64	$Xm,$t0,$t0

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$H2,$Xl,$t2

	vext.8		$t1,$H2,$H2,#8		@ Karatsuba pre-processing
	veor		$t1,$t1,$H2
	vext.8		$Hhl,$t0,$t1,#8		@ pack Karatsuba pre-processed
	vst1.64		{$Hhl-$H2},[x0]		@ store Htable[1..2]

	ret
.size	gcm_init_v8,.-gcm_init_v8
___
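
# Note on the resulting table layout (a descriptive comment inferred from
# the stores above, not an externally specified format):
#
#	Htable[0]	twisted H
#	Htable[1]	H.lo^H.hi packed with (H^2).lo^(H^2).hi, i.e. the
#			Karatsuba pre-processed factors ($Hhl below)
#	Htable[2]	twisted H^2
#
# Only these three of the 16 slots are written; the remaining ones are
# left untouched by this module.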
################################################################################
# void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
#
# input:	Xi - current hash value;
#		Htable - table precomputed in gcm_init_v8;
# output:	Xi - next hash value Xi;
#
$code.=<<___;
.global	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
	vld1.64		{$t1},[$Xi]		@ load Xi
	vmov.i8		$xC2,#0xe1
	vld1.64		{$H-$Hhl},[$Htbl]	@ load twisted H, ...
	vshl.u64	$xC2,$xC2,#57
#ifndef __ARMEB__
	vrev64.8	$t1,$t1
#endif
	vext.8		$IN,$t1,$t1,#8

	vpmull.p64	$Xl,$H,$IN		@ H.lo·Xi.lo
	veor		$t1,$t1,$IN		@ Karatsuba pre-processing
	vpmull2.p64	$Xh,$H,$IN		@ H.hi·Xi.hi
	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2

#ifndef __ARMEB__
	vrev64.8	$Xl,$Xl
#endif
	vext.8		$Xl,$Xl,$Xl,#8
	vst1.64		{$Xl},[$Xi]		@ write out Xi

	ret
.size	gcm_gmult_v8,.-gcm_gmult_v8
___
################################################################################
# void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#
# input:	table precomputed in gcm_init_v8;
#		current hash value Xi;
#		pointer to input data;
#		length of input data in bytes, which must be divisible by
#		the block size;
# output:	next hash value Xi;
#
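# For orientation, a hypothetical caller (not part of this module; u128/u64
# are the types from the prototypes above, and H is E(K,0^128) as described
# for gcm_init_v8) would drive the three entry points roughly as follows:
#
#	u128 Htable[16];
#	u64  Xi[2] = {0,0};			/* GHASH accumulator    */
#	gcm_init_v8(Htable,H);			/* once per key         */
#	gcm_ghash_v8(Xi,Htable,inp,len);	/* len: multiple of 16  */
#	gcm_gmult_v8(Xi,Htable);		/* single multiply by H */
#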
$code.=<<___;
.global	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
___
$code.=<<___ if ($flavour !~ /64/);
	vstmdb		sp!,{d8-d15}		@ 32-bit ABI says so
___
$code.=<<___;
	vld1.64		{$Xl},[$Xi]		@ load [rotated] Xi
						@ "[rotated]" means that
						@ loaded value would have
						@ to be rotated in order to
						@ make it appear as in
						@ algorithm specification
	subs		$len,$len,#32		@ see if $len is 32 or larger
	mov		$inc,#16		@ $inc is used as post-
						@ increment for input pointer;
						@ as loop is modulo-scheduled
						@ $inc is zeroed just in time
						@ to preclude overstepping
						@ inp[len], which means that
						@ last block[s] are actually
						@ loaded twice, but last
						@ copy is not processed
	vld1.64		{$H-$Hhl},[$Htbl],#32	@ load twisted H, ..., H^2
	vmov.i8		$xC2,#0xe1
	vld1.64		{$H2},[$Htbl]
	cclr		$inc,eq			@ is it time to zero $inc?
	vext.8		$Xl,$Xl,$Xl,#8		@ rotate Xi
	vld1.64		{$t0},[$inp],#16	@ load [rotated] I[0]
	vshl.u64	$xC2,$xC2,#57		@ compose 0xc2.0 constant
#ifndef __ARMEB__
	vrev64.8	$t0,$t0
	vrev64.8	$Xl,$Xl
#endif
	vext.8		$IN,$t0,$t0,#8		@ rotate I[0]
	b.lo		.Lodd_tail_v8		@ $len was less than 32
___
{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
	#######
	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
	#	[(H*Ii+1) + (H*Xi+1)] mod P =
	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
	#
$code.=<<___;
	vld1.64		{$t1},[$inp],$inc	@ load [rotated] I[1]
#ifndef __ARMEB__
	vrev64.8	$t1,$t1
#endif
	vext.8		$In,$t1,$t1,#8
	veor		$IN,$IN,$Xl		@ I[i]^=Xi
	vpmull.p64	$Xln,$H,$In		@ H·Ii+1
	veor		$t1,$t1,$In		@ Karatsuba pre-processing
	vpmull2.p64	$Xhn,$H,$In
	b		.Loop_mod2x_v8

.align	4
.Loop_mod2x_v8:
	vext.8		$t2,$IN,$IN,#8
	subs		$len,$len,#32		@ is there more data?
	vpmull.p64	$Xl,$H2,$IN		@ H^2.lo·Xi.lo
	cclr		$inc,lo			@ is it time to zero $inc?

	vpmull.p64	$Xmn,$Hhl,$t1
	veor		$t2,$t2,$IN		@ Karatsuba pre-processing
	vpmull2.p64	$Xh,$H2,$IN		@ H^2.hi·Xi.hi
	veor		$Xl,$Xl,$Xln		@ accumulate
	vpmull2.p64	$Xm,$Hhl,$t2		@ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	vld1.64		{$t0},[$inp],$inc	@ load [rotated] I[i+2]

	veor		$Xh,$Xh,$Xhn
	cclr		$inc,eq			@ is it time to zero $inc?
	veor		$Xm,$Xm,$Xmn

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	vld1.64		{$t1},[$inp],$inc	@ load [rotated] I[i+3]
#ifndef __ARMEB__
	vrev64.8	$t0,$t0
#endif
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction

#ifndef __ARMEB__
	vrev64.8	$t1,$t1
#endif
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	vext.8		$In,$t1,$t1,#8
	vext.8		$IN,$t0,$t0,#8
	veor		$Xl,$Xm,$t2
	vpmull.p64	$Xln,$H,$In		@ H·Ii+1
	veor		$IN,$IN,$Xh		@ accumulate $IN early

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$IN,$IN,$t2
	veor		$t1,$t1,$In		@ Karatsuba pre-processing
	veor		$IN,$IN,$Xl
	vpmull2.p64	$Xhn,$H,$In
	b.hs		.Loop_mod2x_v8		@ there were at least 32 more bytes

	veor		$Xh,$Xh,$t2
	vext.8		$IN,$t0,$t0,#8		@ re-construct $IN
	adds		$len,$len,#32		@ re-construct $len
	veor		$Xl,$Xl,$Xh		@ re-construct $Xl
	b.eq		.Ldone_v8		@ is $len zero?
___
}
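#######
# The tail below handles the case when a single block remains, either
# because the input was one block long or because the block count was odd;
# it is the plain, non-aggregated form of the recurrence above:
#
# Xi+1 = [H*(Ii + Xi)] mod P
#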
$code.=<<___;
.Lodd_tail_v8:
	vext.8		$t2,$Xl,$Xl,#8
	veor		$IN,$IN,$Xl		@ inp^=Xi
	veor		$t1,$t0,$t2		@ $t1 is rotated inp^Xi

	vpmull.p64	$Xl,$H,$IN		@ H.lo·Xi.lo
	veor		$t1,$t1,$IN		@ Karatsuba pre-processing
	vpmull2.p64	$Xh,$H,$IN		@ H.hi·Xi.hi
	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2

.Ldone_v8:
#ifndef __ARMEB__
	vrev64.8	$Xl,$Xl
#endif
	vext.8		$Xl,$Xl,$Xl,#8
	vst1.64		{$Xl},[$Xi]		@ write out Xi

___
$code.=<<___ if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}		@ 32-bit ABI says so
___
$code.=<<___;
	ret
.size	gcm_ghash_v8,.-gcm_ghash_v8
___
}
$code.=<<___;
.asciz	"GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

if ($flavour =~ /64/) {				######## 64-bit code
    sub unvmov {
	my $arg=shift;

	$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
	sprintf	"ins	v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;
    }
    foreach(split("\n",$code)) {
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vmov\s+(.*)/unvmov($1)/geo	or
	s/vext\.8/ext/o		or
	s/vshr\.s/sshr\.s/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;				# old->new style commentary

	# fix up remaining legacy suffixes
	s/\.[ui]?8(\s)/$1/o;
	s/\.[uis]?32//o and s/\.16b/\.4s/go;
	m/\.p64/o and s/\.16b/\.1q/o;		# 1st pmull argument
	m/l\.p64/o and s/\.16b/\.1d/go;		# 2nd and 3rd pmull arguments
	s/\.[uisp]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }
    sub unvpmullp64 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
	    my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
				 |(($2&7)<<17)|(($2&8)<<4)
				 |(($3&7)<<1) |(($3&8)<<2);
	    $word |= 0x00010001	 if ($mnemonic =~ "2");
	    # ARMv7 instructions are always encoded little-endian, hence
	    # the explicit byte order below; the correct solution would be
	    # to use the .inst directive, but older assemblers don't
	    # implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
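
    # The loop below converts the unified source back into classic 32-bit
    # syntax; e.g. "vst1.64 {q12},[x0],#16" above comes out as
    # "vst1.64 {q12},[r0]!", and "cclr x12,eq" becomes "moveq r12,#0".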

    foreach(split("\n",$code)) {
	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\],#[0-9]+/]!/o;

	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo	or
	s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo	or
	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
	s/^(\s+)b\./$1b/o	or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT;			# enforce flush