1#! /usr/bin/env perl 2# Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# April 2010 18# 19# The module implements "4-bit" GCM GHASH function and underlying 20# single multiplication operation in GF(2^128). "4-bit" means that it 21# uses 256 bytes per-key table [+32 bytes shared table]. There is no 22# experimental performance data available yet. The only approximation 23# that can be made at this point is based on code size. Inner loop is 24# 32 instructions long and on single-issue core should execute in <40 25# cycles. Having verified that gcc 3.4 didn't unroll corresponding 26# loop, this assembler loop body was found to be ~3x smaller than 27# compiler-generated one... 28# 29# July 2010 30# 31# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on 32# Cortex A8 core and ~25 cycles per processed byte (which was observed 33# to be ~3 times faster than gcc-generated code:-) 34# 35# February 2011 36# 37# Profiler-assisted and platform-specific optimization resulted in 7% 38# improvement on Cortex A8 core and ~23.5 cycles per byte. 39# 40# March 2011 41# 42# Add NEON implementation featuring polynomial multiplication, i.e. no 43# lookup tables involved. On Cortex A8 it was measured to process one 44# byte in 15 cycles or 55% faster than integer-only code. 45# 46# April 2014 47# 48# Switch to multiplication algorithm suggested in paper referred 49# below and combine it with reduction algorithm from x86 module. 50# Performance improvement over previous version varies from 65% on 51# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8 52# processes one byte in 8.45 cycles, A9 - in 10.2, A15 - in 7.63, 53# Snapdragon S4 - in 9.33. 54# 55# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software 56# Polynomial Multiplication on ARM Processors using the NEON Engine. 57# 58# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf 59 60# ==================================================================== 61# Note about "528B" variant. In ARM case it makes lesser sense to 62# implement it for following reasons: 63# 64# - performance improvement won't be anywhere near 50%, because 128- 65# bit shift operation is neatly fused with 128-bit xor here, and 66# "538B" variant would eliminate only 4-5 instructions out of 32 67# in the inner loop (meaning that estimated improvement is ~15%); 68# - ARM-based systems are often embedded ones and extra memory 69# consumption might be unappreciated (for so little improvement); 70# 71# Byte order [in]dependence. ========================================= 72# 73# Caller is expected to maintain specific *dword* order in Htable, 74# namely with *least* significant dword of 128-bit value at *lower* 75# address. This differs completely from C code and has everything to 76# do with ldm instruction and order in which dwords are "consumed" by 77# algorithm. *Byte* order within these dwords in turn is whatever 78# *native* byte order on current platform. See gcm128.c for working 79# example... 80 81$flavour = shift; 82if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } 83else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } 84 85if ($flavour && $flavour ne "void") { 86 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 87 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or 88 ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or 89 die "can't locate arm-xlate.pl"; 90 91 open OUT,"| \"$^X\" $xlate $flavour $output"; 92 *STDOUT=*OUT; 93} else { 94 open OUT,">$output"; 95 *STDOUT=*OUT; 96} 97 98$Xi="r0"; # argument block 99$Htbl="r1"; 100$inp="r2"; 101$len="r3"; 102 103$Zll="r4"; # variables 104$Zlh="r5"; 105$Zhl="r6"; 106$Zhh="r7"; 107$Tll="r8"; 108$Tlh="r9"; 109$Thl="r10"; 110$Thh="r11"; 111$nlo="r12"; 112################# r13 is stack pointer 113$nhi="r14"; 114################# r15 is program counter 115 116$rem_4bit=$inp; # used in gcm_gmult_4bit 117$cnt=$len; 118 119sub Zsmash() { 120 my $i=12; 121 my @args=@_; 122 for ($Zll,$Zlh,$Zhl,$Zhh) { 123 $code.=<<___; 124#if __ARM_ARCH__>=7 && defined(__ARMEL__) 125 rev $_,$_ 126 str $_,[$Xi,#$i] 127#elif defined(__ARMEB__) 128 str $_,[$Xi,#$i] 129#else 130 mov $Tlh,$_,lsr#8 131 strb $_,[$Xi,#$i+3] 132 mov $Thl,$_,lsr#16 133 strb $Tlh,[$Xi,#$i+2] 134 mov $Thh,$_,lsr#24 135 strb $Thl,[$Xi,#$i+1] 136 strb $Thh,[$Xi,#$i] 137#endif 138___ 139 $code.="\t".shift(@args)."\n"; 140 $i-=4; 141 } 142} 143 144$code=<<___; 145#include <openssl/arm_arch.h> 146 147@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both 148@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL 149@ instructions are in aesv8-armx.pl.) 150.arch armv7-a 151 152.text 153#if defined(__thumb2__) || defined(__clang__) 154.syntax unified 155#define ldrplb ldrbpl 156#define ldrneb ldrbne 157#endif 158#if defined(__thumb2__) 159.thumb 160#else 161.code 32 162#endif 163 164.type rem_4bit,%object 165.align 5 166rem_4bit: 167.short 0x0000,0x1C20,0x3840,0x2460 168.short 0x7080,0x6CA0,0x48C0,0x54E0 169.short 0xE100,0xFD20,0xD940,0xC560 170.short 0x9180,0x8DA0,0xA9C0,0xB5E0 171.size rem_4bit,.-rem_4bit 172 173.type rem_4bit_get,%function 174rem_4bit_get: 175#if defined(__thumb2__) 176 adr $rem_4bit,rem_4bit 177#else 178 sub $rem_4bit,pc,#8+32 @ &rem_4bit 179#endif 180 b .Lrem_4bit_got 181 nop 182 nop 183.size rem_4bit_get,.-rem_4bit_get 184 185.global gcm_ghash_4bit 186.type gcm_ghash_4bit,%function 187.align 4 188gcm_ghash_4bit: 189#if defined(__thumb2__) 190 adr r12,rem_4bit 191#else 192 sub r12,pc,#8+48 @ &rem_4bit 193#endif 194 add $len,$inp,$len @ $len to point at the end 195 stmdb sp!,{r3-r11,lr} @ save $len/end too 196 197 ldmia r12,{r4-r11} @ copy rem_4bit ... 198 stmdb sp!,{r4-r11} @ ... to stack 199 200 ldrb $nlo,[$inp,#15] 201 ldrb $nhi,[$Xi,#15] 202.Louter: 203 eor $nlo,$nlo,$nhi 204 and $nhi,$nlo,#0xf0 205 and $nlo,$nlo,#0x0f 206 mov $cnt,#14 207 208 add $Zhh,$Htbl,$nlo,lsl#4 209 ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo] 210 add $Thh,$Htbl,$nhi 211 ldrb $nlo,[$inp,#14] 212 213 and $nhi,$Zll,#0xf @ rem 214 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] 215 add $nhi,$nhi,$nhi 216 eor $Zll,$Tll,$Zll,lsr#4 217 ldrh $Tll,[sp,$nhi] @ rem_4bit[rem] 218 eor $Zll,$Zll,$Zlh,lsl#28 219 ldrb $nhi,[$Xi,#14] 220 eor $Zlh,$Tlh,$Zlh,lsr#4 221 eor $Zlh,$Zlh,$Zhl,lsl#28 222 eor $Zhl,$Thl,$Zhl,lsr#4 223 eor $Zhl,$Zhl,$Zhh,lsl#28 224 eor $Zhh,$Thh,$Zhh,lsr#4 225 eor $nlo,$nlo,$nhi 226 and $nhi,$nlo,#0xf0 227 and $nlo,$nlo,#0x0f 228 eor $Zhh,$Zhh,$Tll,lsl#16 229 230.Linner: 231 add $Thh,$Htbl,$nlo,lsl#4 232 and $nlo,$Zll,#0xf @ rem 233 subs $cnt,$cnt,#1 234 add $nlo,$nlo,$nlo 235 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo] 236 eor $Zll,$Tll,$Zll,lsr#4 237 eor $Zll,$Zll,$Zlh,lsl#28 238 eor $Zlh,$Tlh,$Zlh,lsr#4 239 eor $Zlh,$Zlh,$Zhl,lsl#28 240 ldrh $Tll,[sp,$nlo] @ rem_4bit[rem] 241 eor $Zhl,$Thl,$Zhl,lsr#4 242#ifdef __thumb2__ 243 it pl 244#endif 245 ldrplb $nlo,[$inp,$cnt] 246 eor $Zhl,$Zhl,$Zhh,lsl#28 247 eor $Zhh,$Thh,$Zhh,lsr#4 248 249 add $Thh,$Htbl,$nhi 250 and $nhi,$Zll,#0xf @ rem 251 eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] 252 add $nhi,$nhi,$nhi 253 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] 254 eor $Zll,$Tll,$Zll,lsr#4 255#ifdef __thumb2__ 256 it pl 257#endif 258 ldrplb $Tll,[$Xi,$cnt] 259 eor $Zll,$Zll,$Zlh,lsl#28 260 eor $Zlh,$Tlh,$Zlh,lsr#4 261 ldrh $Tlh,[sp,$nhi] 262 eor $Zlh,$Zlh,$Zhl,lsl#28 263 eor $Zhl,$Thl,$Zhl,lsr#4 264 eor $Zhl,$Zhl,$Zhh,lsl#28 265#ifdef __thumb2__ 266 it pl 267#endif 268 eorpl $nlo,$nlo,$Tll 269 eor $Zhh,$Thh,$Zhh,lsr#4 270#ifdef __thumb2__ 271 itt pl 272#endif 273 andpl $nhi,$nlo,#0xf0 274 andpl $nlo,$nlo,#0x0f 275 eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem] 276 bpl .Linner 277 278 ldr $len,[sp,#32] @ re-load $len/end 279 add $inp,$inp,#16 280 mov $nhi,$Zll 281___ 282 &Zsmash("cmp\t$inp,$len","\n". 283 "#ifdef __thumb2__\n". 284 " it ne\n". 285 "#endif\n". 286 " ldrneb $nlo,[$inp,#15]"); 287$code.=<<___; 288 bne .Louter 289 290 add sp,sp,#36 291#if __ARM_ARCH__>=5 292 ldmia sp!,{r4-r11,pc} 293#else 294 ldmia sp!,{r4-r11,lr} 295 tst lr,#1 296 moveq pc,lr @ be binary compatible with V4, yet 297 bx lr @ interoperable with Thumb ISA:-) 298#endif 299.size gcm_ghash_4bit,.-gcm_ghash_4bit 300 301.global gcm_gmult_4bit 302.type gcm_gmult_4bit,%function 303gcm_gmult_4bit: 304 stmdb sp!,{r4-r11,lr} 305 ldrb $nlo,[$Xi,#15] 306 b rem_4bit_get 307.Lrem_4bit_got: 308 and $nhi,$nlo,#0xf0 309 and $nlo,$nlo,#0x0f 310 mov $cnt,#14 311 312 add $Zhh,$Htbl,$nlo,lsl#4 313 ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo] 314 ldrb $nlo,[$Xi,#14] 315 316 add $Thh,$Htbl,$nhi 317 and $nhi,$Zll,#0xf @ rem 318 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] 319 add $nhi,$nhi,$nhi 320 eor $Zll,$Tll,$Zll,lsr#4 321 ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem] 322 eor $Zll,$Zll,$Zlh,lsl#28 323 eor $Zlh,$Tlh,$Zlh,lsr#4 324 eor $Zlh,$Zlh,$Zhl,lsl#28 325 eor $Zhl,$Thl,$Zhl,lsr#4 326 eor $Zhl,$Zhl,$Zhh,lsl#28 327 eor $Zhh,$Thh,$Zhh,lsr#4 328 and $nhi,$nlo,#0xf0 329 eor $Zhh,$Zhh,$Tll,lsl#16 330 and $nlo,$nlo,#0x0f 331 332.Loop: 333 add $Thh,$Htbl,$nlo,lsl#4 334 and $nlo,$Zll,#0xf @ rem 335 subs $cnt,$cnt,#1 336 add $nlo,$nlo,$nlo 337 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo] 338 eor $Zll,$Tll,$Zll,lsr#4 339 eor $Zll,$Zll,$Zlh,lsl#28 340 eor $Zlh,$Tlh,$Zlh,lsr#4 341 eor $Zlh,$Zlh,$Zhl,lsl#28 342 ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem] 343 eor $Zhl,$Thl,$Zhl,lsr#4 344#ifdef __thumb2__ 345 it pl 346#endif 347 ldrplb $nlo,[$Xi,$cnt] 348 eor $Zhl,$Zhl,$Zhh,lsl#28 349 eor $Zhh,$Thh,$Zhh,lsr#4 350 351 add $Thh,$Htbl,$nhi 352 and $nhi,$Zll,#0xf @ rem 353 eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] 354 add $nhi,$nhi,$nhi 355 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] 356 eor $Zll,$Tll,$Zll,lsr#4 357 eor $Zll,$Zll,$Zlh,lsl#28 358 eor $Zlh,$Tlh,$Zlh,lsr#4 359 ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem] 360 eor $Zlh,$Zlh,$Zhl,lsl#28 361 eor $Zhl,$Thl,$Zhl,lsr#4 362 eor $Zhl,$Zhl,$Zhh,lsl#28 363 eor $Zhh,$Thh,$Zhh,lsr#4 364#ifdef __thumb2__ 365 itt pl 366#endif 367 andpl $nhi,$nlo,#0xf0 368 andpl $nlo,$nlo,#0x0f 369 eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] 370 bpl .Loop 371___ 372 &Zsmash(); 373$code.=<<___; 374#if __ARM_ARCH__>=5 375 ldmia sp!,{r4-r11,pc} 376#else 377 ldmia sp!,{r4-r11,lr} 378 tst lr,#1 379 moveq pc,lr @ be binary compatible with V4, yet 380 bx lr @ interoperable with Thumb ISA:-) 381#endif 382.size gcm_gmult_4bit,.-gcm_gmult_4bit 383___ 384{ 385my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3)); 386my ($t0,$t1,$t2,$t3)=map("q$_",(8..12)); 387my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31)); 388 389sub clmul64x64 { 390my ($r,$a,$b)=@_; 391$code.=<<___; 392 vext.8 $t0#lo, $a, $a, #1 @ A1 393 vmull.p8 $t0, $t0#lo, $b @ F = A1*B 394 vext.8 $r#lo, $b, $b, #1 @ B1 395 vmull.p8 $r, $a, $r#lo @ E = A*B1 396 vext.8 $t1#lo, $a, $a, #2 @ A2 397 vmull.p8 $t1, $t1#lo, $b @ H = A2*B 398 vext.8 $t3#lo, $b, $b, #2 @ B2 399 vmull.p8 $t3, $a, $t3#lo @ G = A*B2 400 vext.8 $t2#lo, $a, $a, #3 @ A3 401 veor $t0, $t0, $r @ L = E + F 402 vmull.p8 $t2, $t2#lo, $b @ J = A3*B 403 vext.8 $r#lo, $b, $b, #3 @ B3 404 veor $t1, $t1, $t3 @ M = G + H 405 vmull.p8 $r, $a, $r#lo @ I = A*B3 406 veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8 407 vand $t0#hi, $t0#hi, $k48 408 vext.8 $t3#lo, $b, $b, #4 @ B4 409 veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16 410 vand $t1#hi, $t1#hi, $k32 411 vmull.p8 $t3, $a, $t3#lo @ K = A*B4 412 veor $t2, $t2, $r @ N = I + J 413 veor $t0#lo, $t0#lo, $t0#hi 414 veor $t1#lo, $t1#lo, $t1#hi 415 veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24 416 vand $t2#hi, $t2#hi, $k16 417 vext.8 $t0, $t0, $t0, #15 418 veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32 419 vmov.i64 $t3#hi, #0 420 vext.8 $t1, $t1, $t1, #14 421 veor $t2#lo, $t2#lo, $t2#hi 422 vmull.p8 $r, $a, $b @ D = A*B 423 vext.8 $t3, $t3, $t3, #12 424 vext.8 $t2, $t2, $t2, #13 425 veor $t0, $t0, $t1 426 veor $t2, $t2, $t3 427 veor $r, $r, $t0 428 veor $r, $r, $t2 429___ 430} 431 432$code.=<<___; 433#if __ARM_MAX_ARCH__>=7 434.arch armv7-a 435.fpu neon 436 437.global gcm_init_neon 438.type gcm_init_neon,%function 439.align 4 440gcm_init_neon: 441 vld1.64 $IN#hi,[r1]! @ load H 442 vmov.i8 $t0,#0xe1 443 vld1.64 $IN#lo,[r1] 444 vshl.i64 $t0#hi,#57 445 vshr.u64 $t0#lo,#63 @ t0=0xc2....01 446 vdup.8 $t1,$IN#hi[7] 447 vshr.u64 $Hlo,$IN#lo,#63 448 vshr.s8 $t1,#7 @ broadcast carry bit 449 vshl.i64 $IN,$IN,#1 450 vand $t0,$t0,$t1 451 vorr $IN#hi,$Hlo @ H<<<=1 452 veor $IN,$IN,$t0 @ twisted H 453 vstmia r0,{$IN} 454 455 ret @ bx lr 456.size gcm_init_neon,.-gcm_init_neon 457 458.global gcm_gmult_neon 459.type gcm_gmult_neon,%function 460.align 4 461gcm_gmult_neon: 462 vld1.64 $IN#hi,[$Xi]! @ load Xi 463 vld1.64 $IN#lo,[$Xi]! 464 vmov.i64 $k48,#0x0000ffffffffffff 465 vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H 466 vmov.i64 $k32,#0x00000000ffffffff 467#ifdef __ARMEL__ 468 vrev64.8 $IN,$IN 469#endif 470 vmov.i64 $k16,#0x000000000000ffff 471 veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing 472 mov $len,#16 473 b .Lgmult_neon 474.size gcm_gmult_neon,.-gcm_gmult_neon 475 476.global gcm_ghash_neon 477.type gcm_ghash_neon,%function 478.align 4 479gcm_ghash_neon: 480 vld1.64 $Xl#hi,[$Xi]! @ load Xi 481 vld1.64 $Xl#lo,[$Xi]! 482 vmov.i64 $k48,#0x0000ffffffffffff 483 vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H 484 vmov.i64 $k32,#0x00000000ffffffff 485#ifdef __ARMEL__ 486 vrev64.8 $Xl,$Xl 487#endif 488 vmov.i64 $k16,#0x000000000000ffff 489 veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing 490 491.Loop_neon: 492 vld1.64 $IN#hi,[$inp]! @ load inp 493 vld1.64 $IN#lo,[$inp]! 494#ifdef __ARMEL__ 495 vrev64.8 $IN,$IN 496#endif 497 veor $IN,$Xl @ inp^=Xi 498.Lgmult_neon: 499___ 500 &clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo 501$code.=<<___; 502 veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing 503___ 504 &clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi) 505 &clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi 506$code.=<<___; 507 veor $Xm,$Xm,$Xl @ Karatsuba post-processing 508 veor $Xm,$Xm,$Xh 509 veor $Xl#hi,$Xl#hi,$Xm#lo 510 veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result 511 512 @ equivalent of reduction_avx from ghash-x86_64.pl 513 vshl.i64 $t1,$Xl,#57 @ 1st phase 514 vshl.i64 $t2,$Xl,#62 515 veor $t2,$t2,$t1 @ 516 vshl.i64 $t1,$Xl,#63 517 veor $t2, $t2, $t1 @ 518 veor $Xl#hi,$Xl#hi,$t2#lo @ 519 veor $Xh#lo,$Xh#lo,$t2#hi 520 521 vshr.u64 $t2,$Xl,#1 @ 2nd phase 522 veor $Xh,$Xh,$Xl 523 veor $Xl,$Xl,$t2 @ 524 vshr.u64 $t2,$t2,#6 525 vshr.u64 $Xl,$Xl,#1 @ 526 veor $Xl,$Xl,$Xh @ 527 veor $Xl,$Xl,$t2 @ 528 529 subs $len,#16 530 bne .Loop_neon 531 532#ifdef __ARMEL__ 533 vrev64.8 $Xl,$Xl 534#endif 535 sub $Xi,#16 536 vst1.64 $Xl#hi,[$Xi]! @ write out Xi 537 vst1.64 $Xl#lo,[$Xi] 538 539 ret @ bx lr 540.size gcm_ghash_neon,.-gcm_ghash_neon 541#endif 542___ 543} 544$code.=<<___; 545.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" 546.align 2 547___ 548 549foreach (split("\n",$code)) { 550 s/\`([^\`]*)\`/eval $1/geo; 551 552 s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or 553 s/\bret\b/bx lr/go or 554 s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 555 556 print $_,"\n"; 557} 558close STDOUT or die "error closing STDOUT"; # enforce flush 559