#! /usr/bin/env perl
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
#
# June 2014
#
# Initial version was developed in tight cooperation with Ard
# Biesheuvel of Linaro from bits-n-pieces from other assembly modules.
# Just like aesv8-armx.pl this module supports both AArch32 and
# AArch64 execution modes.
#
# July 2014
#
# Implement 2x aggregated reduction [see ghash-x86.pl for background
# information].
#
# November 2017
#
# AArch64 register bank to "accommodate" 4x aggregated reduction and
# improve performance by 20-70% depending on processor.
#
# Current performance in cycles per processed byte:
#
#		64-bit PMULL	32-bit PMULL	32-bit NEON(*)
# Apple A7	0.58		0.92		5.62
# Cortex-A53	0.85		1.01		8.39
# Cortex-A57	0.73		1.17		7.61
# Denver	0.51		0.65		6.02
# Mongoose	0.65		1.10		8.06
# Kryo		0.76		1.16		8.00
#
# (*)	presented for reference/comparison purposes;

$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$Xi="x0";	# argument block
$Htbl="x1";
$inp="x2";
$len="x3";

$inc="x12";

{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));

$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/);
$code.=<<___			if ($flavour !~ /64/);
.fpu	neon
.code	32
#undef	__thumb2__
___

################################################################################
# void gcm_init_v8(u128 Htable[16],const u64 H[2]);
#
# input:	128-bit H - secret parameter E(K,0^128)
# output:	precomputed table filled with degrees of twisted H;
#		H is twisted to handle reverse bitness of GHASH;
#		only a few of the 16 slots of Htable[16] are used;
#		data is opaque to the outside world (which allows the
#		code to be optimized independently);
#
$code.=<<___;
.global	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
	vld1.64		{$t1},[x1]		@ load input H
	vmov.i8		$xC2,#0xe1
	vshl.i64	$xC2,$xC2,#57		@ 0xc2.0
	vext.8		$IN,$t1,$t1,#8
	vshr.u64	$t2,$xC2,#63
	vdup.32		$t1,${t1}[1]
	vext.8		$t0,$t2,$xC2,#8		@ t0=0xc2....01
	vshr.u64	$t2,$IN,#63
	vshr.s32	$t1,$t1,#31		@ broadcast carry bit
	vand		$t2,$t2,$t0
	vshl.i64	$IN,$IN,#1
	vext.8		$t2,$t2,$t2,#8
	vand		$t0,$t0,$t1
	vorr		$IN,$IN,$t2		@ H<<<=1
	veor		$H,$IN,$t0		@ twisted H
	vst1.64		{$H},[x0],#16		@ store Htable[0]

	@ calculate H^2
	vext.8		$t0,$H,$H,#8		@ Karatsuba pre-processing
	vpmull.p64	$Xl,$H,$H
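	@ Xl=H.lo·H.lo, Xh=H.hi·H.hi and Xm=(H.lo+H.hi)·(H.lo+H.hi)
	@ are the three Karatsuba partial products of H·H
	@ ("+" and accumulation being xor in GF(2)[x])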
	veor		$t0,$t0,$H
	vpmull2.p64	$Xh,$H,$H
	vpmull.p64	$Xm,$t0,$t0

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$H2,$Xl,$t2

	vext.8		$t1,$H2,$H2,#8		@ Karatsuba pre-processing
	veor		$t1,$t1,$H2
	vext.8		$Hhl,$t0,$t1,#8		@ pack Karatsuba pre-processed
	vst1.64		{$Hhl-$H2},[x0],#32	@ store Htable[1..2]
___
if ($flavour =~ /64/) {
my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));

$code.=<<___;
	@ calculate H^3 and H^4
	vpmull.p64	$Xl,$H, $H2
	vpmull.p64	$Yl,$H2,$H2
	vpmull2.p64	$Xh,$H, $H2
	vpmull2.p64	$Yh,$H2,$H2
	vpmull.p64	$Xm,$t0,$t1
	vpmull.p64	$Ym,$t1,$t1

	vext.8		$t0,$Xl,$Xh,#8		@ Karatsuba post-processing
	vext.8		$t1,$Yl,$Yh,#8
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t0
	veor		$t3,$Yl,$Yh
	veor		$Ym,$Ym,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase
	veor		$Ym,$Ym,$t3
	vpmull.p64	$t3,$Yl,$xC2

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Yh#lo,$Ym#hi
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	vmov		$Ym#hi,$Yl#lo
	veor		$Xl,$Xm,$t2
	veor		$Yl,$Ym,$t3

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
	vext.8		$t3,$Yl,$Yl,#8
	vpmull.p64	$Xl,$Xl,$xC2
	vpmull.p64	$Yl,$Yl,$xC2
	veor		$t2,$t2,$Xh
	veor		$t3,$t3,$Yh
	veor		$H, $Xl,$t2		@ H^3
	veor		$H2,$Yl,$t3		@ H^4

	vext.8		$t0,$H, $H,#8		@ Karatsuba pre-processing
	vext.8		$t1,$H2,$H2,#8
	veor		$t0,$t0,$H
	veor		$t1,$t1,$H2
	vext.8		$Hhl,$t0,$t1,#8		@ pack Karatsuba pre-processed
	vst1.64		{$H-$H2},[x0]		@ store Htable[3..5]
___
}
$code.=<<___;
	ret
.size	gcm_init_v8,.-gcm_init_v8
___
################################################################################
# void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
#
# input:	Xi - current hash value;
#		Htable - table precomputed in gcm_init_v8;
# output:	Xi - next hash value Xi;
#
$code.=<<___;
.global	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
	vld1.64		{$t1},[$Xi]		@ load Xi
	vmov.i8		$xC2,#0xe1
	vld1.64		{$H-$Hhl},[$Htbl]	@ load twisted H, ...
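	@ (the second register, Hhl, carries the packed Karatsuba
	@ factors H.lo+H.hi and H^2.lo+H^2.hi prepared by gcm_init_v8;
	@ only its low half is used here)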
	vshl.u64	$xC2,$xC2,#57
#ifndef __ARMEB__
	vrev64.8	$t1,$t1
#endif
	vext.8		$IN,$t1,$t1,#8

	vpmull.p64	$Xl,$H,$IN		@ H.lo·Xi.lo
	veor		$t1,$t1,$IN		@ Karatsuba pre-processing
	vpmull2.p64	$Xh,$H,$IN		@ H.hi·Xi.hi
	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2

#ifndef __ARMEB__
	vrev64.8	$Xl,$Xl
#endif
	vext.8		$Xl,$Xl,$Xl,#8
	vst1.64		{$Xl},[$Xi]		@ write out Xi

	ret
.size	gcm_gmult_v8,.-gcm_gmult_v8
___
################################################################################
# void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#
# input:	table precomputed in gcm_init_v8;
#		current hash value Xi;
#		pointer to input data;
#		length of input data in bytes, which must be divisible
#		by the block size;
# output:	next hash value Xi;
#
$code.=<<___;
.global	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
___
$code.=<<___		if ($flavour =~ /64/);
	cmp		$len,#64
	b.hs		.Lgcm_ghash_v8_4x
___
$code.=<<___		if ($flavour !~ /64/);
	vstmdb		sp!,{d8-d15}		@ 32-bit ABI says so
___
$code.=<<___;
	vld1.64		{$Xl},[$Xi]		@ load [rotated] Xi
						@ "[rotated]" means that
						@ loaded value would have
						@ to be rotated in order to
						@ make it appear as in
						@ algorithm specification
	subs		$len,$len,#32		@ see if $len is 32 or larger
	mov		$inc,#16		@ $inc is used as post-
						@ increment for input pointer;
						@ as loop is modulo-scheduled
						@ $inc is zeroed just in time
						@ to preclude overstepping
						@ inp[len], which means that
						@ last block[s] are actually
						@ loaded twice, but last
						@ copy is not processed
	vld1.64		{$H-$Hhl},[$Htbl],#32	@ load twisted H, ..., H^2
	vmov.i8		$xC2,#0xe1
	vld1.64		{$H2},[$Htbl]
	cclr		$inc,eq			@ is it time to zero $inc?
	vext.8		$Xl,$Xl,$Xl,#8		@ rotate Xi
	vld1.64		{$t0},[$inp],#16	@ load [rotated] I[0]
	vshl.u64	$xC2,$xC2,#57		@ compose 0xc2.0 constant
#ifndef __ARMEB__
	vrev64.8	$t0,$t0
	vrev64.8	$Xl,$Xl
#endif
	vext.8		$IN,$t0,$t0,#8		@ rotate I[0]
	b.lo		.Lodd_tail_v8		@ $len was less than 32
___
{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
	#######
	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
	#	[(H*Ii+1) + (H*Xi+1)] mod P =
	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
	#
$code.=<<___;
	vld1.64		{$t1},[$inp],$inc	@ load [rotated] I[1]
#ifndef __ARMEB__
	vrev64.8	$t1,$t1
#endif
	vext.8		$In,$t1,$t1,#8
	veor		$IN,$IN,$Xl		@ I[i]^=Xi
	vpmull.p64	$Xln,$H,$In		@ H·Ii+1
	veor		$t1,$t1,$In		@ Karatsuba pre-processing
	vpmull2.p64	$Xhn,$H,$In
	b		.Loop_mod2x_v8

.align	4
.Loop_mod2x_v8:
	vext.8		$t2,$IN,$IN,#8
	subs		$len,$len,#32		@ is there more data?
	vpmull.p64	$Xl,$H2,$IN		@ H^2.lo·Xi.lo
	cclr		$inc,lo			@ is it time to zero $inc?
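	@ (2x aggregation: H^2·(Xi+I[i]) is multiplied here, while
	@ H·I[i+1] was already started; both products are xored
	@ together before a single reduction, per the formula above)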

	vpmull.p64	$Xmn,$Hhl,$t1
	veor		$t2,$t2,$IN		@ Karatsuba pre-processing
	vpmull2.p64	$Xh,$H2,$IN		@ H^2.hi·Xi.hi
	veor		$Xl,$Xl,$Xln		@ accumulate
	vpmull2.p64	$Xm,$Hhl,$t2		@ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	vld1.64		{$t0},[$inp],$inc	@ load [rotated] I[i+2]

	veor		$Xh,$Xh,$Xhn
	cclr		$inc,eq			@ is it time to zero $inc?
	veor		$Xm,$Xm,$Xmn

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	vld1.64		{$t1},[$inp],$inc	@ load [rotated] I[i+3]
#ifndef __ARMEB__
	vrev64.8	$t0,$t0
#endif
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction

#ifndef __ARMEB__
	vrev64.8	$t1,$t1
#endif
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	vext.8		$In,$t1,$t1,#8
	vext.8		$IN,$t0,$t0,#8
	veor		$Xl,$Xm,$t2
	vpmull.p64	$Xln,$H,$In		@ H·Ii+1
	veor		$IN,$IN,$Xh		@ accumulate $IN early

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$IN,$IN,$t2
	veor		$t1,$t1,$In		@ Karatsuba pre-processing
	veor		$IN,$IN,$Xl
	vpmull2.p64	$Xhn,$H,$In
	b.hs		.Loop_mod2x_v8		@ there were at least 32 more bytes

	veor		$Xh,$Xh,$t2
	vext.8		$IN,$t0,$t0,#8		@ re-construct $IN
	adds		$len,$len,#32		@ re-construct $len
	veor		$Xl,$Xl,$Xh		@ re-construct $Xl
	b.eq		.Ldone_v8		@ is $len zero?
___
}
$code.=<<___;
.Lodd_tail_v8:
	vext.8		$t2,$Xl,$Xl,#8
	veor		$IN,$IN,$Xl		@ inp^=Xi
	veor		$t1,$t0,$t2		@ $t1 is rotated inp^Xi

	vpmull.p64	$Xl,$H,$IN		@ H.lo·Xi.lo
	veor		$t1,$t1,$IN		@ Karatsuba pre-processing
	vpmull2.p64	$Xh,$H,$IN		@ H.hi·Xi.hi
	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2

.Ldone_v8:
#ifndef __ARMEB__
	vrev64.8	$Xl,$Xl
#endif
	vext.8		$Xl,$Xl,$Xl,#8
	vst1.64		{$Xl},[$Xi]		@ write out Xi

___
$code.=<<___		if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}		@ 32-bit ABI says so
___
$code.=<<___;
	ret
.size	gcm_ghash_v8,.-gcm_ghash_v8
___

if ($flavour =~ /64/) {				# 4x subroutine
my ($I0,$j1,$j2,$j3,
    $I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23));

$code.=<<___;
.type	gcm_ghash_v8_4x,%function
.align	4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
	vld1.64		{$Xl},[$Xi]		@ load [rotated] Xi
	vld1.64		{$H-$H2},[$Htbl],#48	@ load twisted H, ..., H^2
	vmov.i8		$xC2,#0xe1
	vld1.64		{$H3-$H4},[$Htbl]	@ load twisted H^3, ..., H^4
	vshl.u64	$xC2,$xC2,#57		@ compose 0xc2.0 constant

	vld1.64		{$I0-$j3},[$inp],#64
#ifndef __ARMEB__
	vrev64.8	$Xl,$Xl
	vrev64.8	$j1,$j1
	vrev64.8	$j2,$j2
	vrev64.8	$j3,$j3
	vrev64.8	$I0,$I0
#endif
	vext.8		$I3,$j3,$j3,#8
	vext.8		$I2,$j2,$j2,#8
	vext.8		$I1,$j1,$j1,#8

	vpmull.p64	$Yl,$H,$I3		@ H·Ii+3
	veor		$j3,$j3,$I3
	vpmull2.p64	$Yh,$H,$I3
	vpmull.p64	$Ym,$Hhl,$j3

	vpmull.p64	$t0,$H2,$I2		@ H^2·Ii+2
	veor		$j2,$j2,$I2
	vpmull2.p64	$I2,$H2,$I2
	vpmull2.p64	$j2,$Hhl,$j2

	veor		$Yl,$Yl,$t0
	veor		$Yh,$Yh,$I2
	veor		$Ym,$Ym,$j2

	vpmull.p64	$j3,$H3,$I1		@ H^3·Ii+1
	veor		$j1,$j1,$I1
	vpmull2.p64	$I1,$H3,$I1
	vpmull.p64	$j1,$H34,$j1

	veor		$Yl,$Yl,$j3
	veor		$Yh,$Yh,$I1
	veor		$Ym,$Ym,$j1

	subs		$len,$len,#128
	b.lo		.Ltail4x

	b		.Loop4x

.align	4
.Loop4x:
	veor		$t0,$I0,$Xl
	vld1.64		{$I0-$j3},[$inp],#64
	vext.8		$IN,$t0,$t0,#8
#ifndef __ARMEB__
	vrev64.8	$j1,$j1
	vrev64.8	$j2,$j2
	vrev64.8	$j3,$j3
	vrev64.8	$I0,$I0
#endif

	vpmull.p64	$Xl,$H4,$IN		@ H^4·(Xi+Ii)
	veor		$t0,$t0,$IN
	vpmull2.p64	$Xh,$H4,$IN
	vext.8		$I3,$j3,$j3,#8
	vpmull2.p64	$Xm,$H34,$t0

	veor		$Xl,$Xl,$Yl
	veor		$Xh,$Xh,$Yh
	vext.8		$I2,$j2,$j2,#8
	veor		$Xm,$Xm,$Ym
	vext.8		$I1,$j1,$j1,#8

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	vpmull.p64	$Yl,$H,$I3		@ H·Ii+3
	veor		$j3,$j3,$I3
	veor		$Xm,$Xm,$t1
	vpmull2.p64	$Yh,$H,$I3
	veor		$Xm,$Xm,$t2
	vpmull.p64	$Ym,$Hhl,$j3

	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	vpmull.p64	$t0,$H2,$I2		@ H^2·Ii+2
	veor		$j2,$j2,$I2
	vpmull2.p64	$I2,$H2,$I2
	veor		$Xl,$Xm,$t2
	vpmull2.p64	$j2,$Hhl,$j2

	veor		$Yl,$Yl,$t0
	veor		$Yh,$Yh,$I2
	veor		$Ym,$Ym,$j2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	vpmull.p64	$j3,$H3,$I1		@ H^3·Ii+1
	veor		$j1,$j1,$I1
	veor		$t2,$t2,$Xh
	vpmull2.p64	$I1,$H3,$I1
	vpmull.p64	$j1,$H34,$j1

	veor		$Xl,$Xl,$t2
	veor		$Yl,$Yl,$j3
	veor		$Yh,$Yh,$I1
	vext.8		$Xl,$Xl,$Xl,#8
	veor		$Ym,$Ym,$j1

	subs		$len,$len,#64
	b.hs		.Loop4x

.Ltail4x:
	veor		$t0,$I0,$Xl
	vext.8		$IN,$t0,$t0,#8

	vpmull.p64	$Xl,$H4,$IN		@ H^4·(Xi+Ii)
	veor		$t0,$t0,$IN
	vpmull2.p64	$Xh,$H4,$IN
	vpmull2.p64	$Xm,$H34,$t0

	veor		$Xl,$Xl,$Yl
	veor		$Xh,$Xh,$Yh
	veor		$Xm,$Xm,$Ym

	adds		$len,$len,#64
	b.eq		.Ldone4x

	cmp		$len,#32
	b.lo		.Lone
	b.eq		.Ltwo
.Lthree:
	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	vld1.64		{$I0-$j2},[$inp]
	veor		$Xm,$Xm,$t2
#ifndef __ARMEB__
	vrev64.8	$j1,$j1
	vrev64.8	$j2,$j2
	vrev64.8	$I0,$I0
#endif

	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	vext.8		$I2,$j2,$j2,#8
	vext.8		$I1,$j1,$j1,#8
	veor		$Xl,$Xm,$t2

	vpmull.p64	$Yl,$H,$I2		@ H·Ii+2
	veor		$j2,$j2,$I2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	vpmull2.p64	$Yh,$H,$I2
	vpmull.p64	$Ym,$Hhl,$j2
	veor		$Xl,$Xl,$t2
	vpmull.p64	$j3,$H2,$I1		@ H^2·Ii+1
	veor		$j1,$j1,$I1
	vext.8		$Xl,$Xl,$Xl,#8

	vpmull2.p64	$I1,$H2,$I1
	veor		$t0,$I0,$Xl
	vpmull2.p64	$j1,$Hhl,$j1
	vext.8		$IN,$t0,$t0,#8

	veor		$Yl,$Yl,$j3
	veor		$Yh,$Yh,$I1
	veor		$Ym,$Ym,$j1

	vpmull.p64	$Xl,$H3,$IN		@ H^3·(Xi+Ii)
	veor		$t0,$t0,$IN
	vpmull2.p64	$Xh,$H3,$IN
	vpmull.p64	$Xm,$H34,$t0

	veor		$Xl,$Xl,$Yl
	veor		$Xh,$Xh,$Yh
	veor		$Xm,$Xm,$Ym
	b		.Ldone4x

.align	4
.Ltwo:
	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	vld1.64		{$I0-$j1},[$inp]
	veor		$Xm,$Xm,$t2
#ifndef __ARMEB__
	vrev64.8	$j1,$j1
	vrev64.8	$I0,$I0
#endif

	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	vext.8		$I1,$j1,$j1,#8
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2
	vext.8		$Xl,$Xl,$Xl,#8

	vpmull.p64	$Yl,$H,$I1		@ H·Ii+1
	veor		$j1,$j1,$I1

	veor		$t0,$I0,$Xl
	vext.8		$IN,$t0,$t0,#8

	vpmull2.p64	$Yh,$H,$I1
	vpmull.p64	$Ym,$Hhl,$j1

	vpmull.p64	$Xl,$H2,$IN		@ H^2·(Xi+Ii)
	veor		$t0,$t0,$IN
	vpmull2.p64	$Xh,$H2,$IN
	vpmull2.p64	$Xm,$Hhl,$t0

	veor		$Xl,$Xl,$Yl
	veor		$Xh,$Xh,$Yh
	veor		$Xm,$Xm,$Ym
	b		.Ldone4x

.align	4
.Lone:
	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	vld1.64		{$I0},[$inp]
	veor		$Xm,$Xm,$t2
#ifndef __ARMEB__
	vrev64.8	$I0,$I0
#endif

	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2
	vext.8		$Xl,$Xl,$Xl,#8

	veor		$t0,$I0,$Xl
	vext.8		$IN,$t0,$t0,#8

	vpmull.p64	$Xl,$H,$IN
	veor		$t0,$t0,$IN
	vpmull2.p64	$Xh,$H,$IN
	vpmull.p64	$Xm,$Hhl,$t0

.Ldone4x:
	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2

	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2
	vext.8		$Xl,$Xl,$Xl,#8

#ifndef __ARMEB__
	vrev64.8	$Xl,$Xl
#endif
	vst1.64		{$Xl},[$Xi]		@ write out Xi

	ret
.size	gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
___

}
}

$code.=<<___;
.asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
#endif
___

if ($flavour =~ /64/) {			######## 64-bit code
    sub unvmov {
	my $arg=shift;

	$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
	sprintf "ins	v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
					     $3<8?$3:$3+8,($4 eq "lo")?0:1;
    }
    foreach(split("\n",$code)) {
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vmov\s+(.*)/unvmov($1)/geo	or
	s/vext\.8/ext/o		or
	s/vshr\.s/sshr\.s/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;				# old->new style commentary

	# fix up remaining legacy suffixes
	s/\.[ui]?8(\s)/$1/o;
	s/\.[uis]?32//o and s/\.16b/\.4s/go;
	m/\.p64/o and s/\.16b/\.1q/o;		# 1st pmull argument
	m/l\.p64/o and s/\.16b/\.1d/go;		# 2nd and 3rd pmull arguments
	s/\.[uisp]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf "vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }
    sub unvpmullp64 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
	    my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
				 |(($2&7)<<17)|(($2&8)<<4)
				 |(($3&7)<<1) |(($3&8)<<2);
	    $word |= 0x00010001	 if ($mnemonic =~ "2");
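	    # ($word packs the three q-register numbers into the Vd/D,
	    # Vn/N and Vm/M fields of the 32-bit VMULL.P64 encoding;
	    # the extra bits set for "pmull2" make Vn and Vm odd, i.e.
	    # select the high d-register halves of the source operands)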
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }

    foreach(split("\n",$code)) {
	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\],#[0-9]+/]!/o;

	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo	or
	s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo	or
	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
	s/^(\s+)b\./$1b/o		or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT or die "error closing STDOUT: $!"; # enforce flush
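
# Usage sketch (file names below are illustrative, not taken from this
# script): the first argument is the perlasm "flavour" - anything
# matching /64/ selects the AArch64 path, anything else the AArch32
# path - and the second is the assembly file written via arm-xlate.pl:
#
#	perl ghashv8-armx.pl linux64 ghashv8-armx.S	# AArch64
#	perl ghashv8-armx.pl linux32 ghashv8-armx.S	# AArch32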