#! /usr/bin/env perl
# Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# April 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+32 bytes shared table].
# There is no experimental performance data available yet. The only
# approximation that can be made at this point is based on code size.
# The inner loop is 32 instructions long and should execute in <40
# cycles on a single-issue core. Having verified that gcc 3.4 didn't
# unroll the corresponding loop, this assembler loop body was found to
# be ~3x smaller than the compiler-generated one...
#
# July 2010
#
# Rescheduling for the dual-issue pipeline resulted in an 8.5%
# improvement on the Cortex A8 core and ~25 cycles per processed byte
# (which was observed to be ~3 times faster than gcc-generated code:-)
#
# February 2011
#
# Profiler-assisted and platform-specific optimization resulted in a 7%
# improvement on the Cortex A8 core and ~23.5 cycles per byte.
#
# March 2011
#
# Add a NEON implementation featuring polynomial multiplication, i.e.
# no lookup tables involved. On Cortex A8 it was measured to process
# one byte in 15 cycles, or 55% faster than the integer-only code.
#
# April 2014
#
# Switch to the multiplication algorithm suggested in the paper
# referenced below and combine it with the reduction algorithm from the
# x86 module. The performance improvement over the previous version
# varies from 65% on Snapdragon S4 to 110% on Cortex A9. In absolute
# terms Cortex A8 processes one byte in 8.45 cycles, A9 in 10.2, A15 in
# 7.63, and Snapdragon S4 in 9.33.
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf

# ====================================================================
# Note about the "528B" variant. In the ARM case it makes less sense to
# implement it, for the following reasons:
#
# - the performance improvement won't be anywhere near 50%, because the
#   128-bit shift operation is neatly fused with the 128-bit xor here,
#   and the "528B" variant would eliminate only 4-5 of the 32
#   instructions in the inner loop (meaning that the estimated
#   improvement is ~15%);
# - ARM-based systems are often embedded ones, and the extra memory
#   consumption might be unwelcome for so little improvement;
#
# Byte order [in]dependence. =========================================
#
# The caller is expected to maintain a specific *dword* order in
# Htable, namely with the *least* significant dword of the 128-bit
# value at the *lower* address. This differs completely from the C code
# and has everything to do with the ldm instruction and the order in
# which dwords are "consumed" by the algorithm. The *byte* order within
# these dwords is in turn whatever the *native* byte order of the
# current platform is. See gcm128.c for a working example...
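#
# As a concrete illustration of the layout described above, using an
# arbitrarily chosen value: for H = 0x00112233445566778899aabbccddeeff,
# the least significant dword 0x8899aabbccddeeff would be stored at
# Htable+0 and the most significant dword 0x0011223344556677 at
# Htable+8, each dword kept in the platform's native byte order.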

# This file was patched in BoringSSL to remove the variable-time 4-bit
# implementation.

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open OUT,"| \"$^X\" $xlate $flavour $output";
    *STDOUT=*OUT;
} else {
    open OUT,">$output";
    *STDOUT=*OUT;
}

$Xi="r0";	# argument block
$Htbl="r1";
$inp="r2";
$len="r3";

$code=<<___;
#include <GFp/arm_arch.h>

@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL
@ instructions are in aesv8-armx.pl.)
.arch	armv7-a

.text
#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
#define ldrplb	ldrbpl
#define ldrneb	ldrbne
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif
___
{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$t3)=map("q$_",(8..12));
my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));

sub clmul64x64 {
my ($r,$a,$b)=@_;
$code.=<<___;
	vext.8		$t0#lo, $a, $a, #1	@ A1
	vmull.p8	$t0, $t0#lo, $b		@ F = A1*B
	vext.8		$r#lo, $b, $b, #1	@ B1
	vmull.p8	$r, $a, $r#lo		@ E = A*B1
	vext.8		$t1#lo, $a, $a, #2	@ A2
	vmull.p8	$t1, $t1#lo, $b		@ H = A2*B
	vext.8		$t3#lo, $b, $b, #2	@ B2
	vmull.p8	$t3, $a, $t3#lo		@ G = A*B2
	vext.8		$t2#lo, $a, $a, #3	@ A3
	veor		$t0, $t0, $r		@ L = E + F
	vmull.p8	$t2, $t2#lo, $b		@ J = A3*B
	vext.8		$r#lo, $b, $b, #3	@ B3
	veor		$t1, $t1, $t3		@ M = G + H
	vmull.p8	$r, $a, $r#lo		@ I = A*B3
	veor		$t0#lo, $t0#lo, $t0#hi	@ t0 = (L) (P0 + P1) << 8
	vand		$t0#hi, $t0#hi, $k48
	vext.8		$t3#lo, $b, $b, #4	@ B4
	veor		$t1#lo, $t1#lo, $t1#hi	@ t1 = (M) (P2 + P3) << 16
	vand		$t1#hi, $t1#hi, $k32
	vmull.p8	$t3, $a, $t3#lo		@ K = A*B4
	veor		$t2, $t2, $r		@ N = I + J
	veor		$t0#lo, $t0#lo, $t0#hi
	veor		$t1#lo, $t1#lo, $t1#hi
	veor		$t2#lo, $t2#lo, $t2#hi	@ t2 = (N) (P4 + P5) << 24
	vand		$t2#hi, $t2#hi, $k16
	vext.8		$t0, $t0, $t0, #15
	veor		$t3#lo, $t3#lo, $t3#hi	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	$t3#hi, #0
	vext.8		$t1, $t1, $t1, #14
	veor		$t2#lo, $t2#lo, $t2#hi
	vmull.p8	$r, $a, $b		@ D = A*B
	vext.8		$t3, $t3, $t3, #12
	vext.8		$t2, $t2, $t2, #13
	veor		$t0, $t0, $t1
	veor		$t2, $t2, $t3
	veor		$r, $r, $t0
	veor		$r, $r, $t2
___
}
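
# clmul64x64 above emits NEON code for one 64x64->128-bit carry-less
# (polynomial) multiplication built out of 8x8-bit vmull.p8 multiplies,
# following the Câmara-Gouvêa-López-Dahab paper cited in the header:
# products of byte-rotated copies of the operands (A1*B, A*B1, A2*B,
# ...) are folded together, masked with $k48/$k32/$k16 and moved into
# position with vext.8, then xored into the straight A*B product. The
# GHASH code below invokes it three times per block, applying one level
# of Karatsuba over the 64-bit halves of H and Xi (H.lo*Xi.lo,
# H.hi*Xi.hi and (H.lo+H.hi)*(Xi.lo+Xi.hi)), and finishes with the
# two-phase reduction equivalent to reduction_avx in ghash-x86_64.pl.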

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	GFp_gcm_init_neon
.type	GFp_gcm_init_neon,%function
.align	4
GFp_gcm_init_neon:
	vld1.64		$IN#hi,[r1]!		@ load H
	vmov.i8		$t0,#0xe1
	vld1.64		$IN#lo,[r1]
	vshl.i64	$t0#hi,#57
	vshr.u64	$t0#lo,#63		@ t0=0xc2....01
	vdup.8		$t1,$IN#hi[7]
	vshr.u64	$Hlo,$IN#lo,#63
	vshr.s8		$t1,#7			@ broadcast carry bit
	vshl.i64	$IN,$IN,#1
	vand		$t0,$t0,$t1
	vorr		$IN#hi,$Hlo		@ H<<<=1
	veor		$IN,$IN,$t0		@ twisted H
	vstmia		r0,{$IN}

	ret					@ bx lr
.size	GFp_gcm_init_neon,.-GFp_gcm_init_neon

.global	GFp_gcm_gmult_neon
.type	GFp_gcm_gmult_neon,%function
.align	4
GFp_gcm_gmult_neon:
	vld1.64		$IN#hi,[$Xi]!		@ load Xi
	vld1.64		$IN#lo,[$Xi]!
	vmov.i64	$k48,#0x0000ffffffffffff
	vldmia		$Htbl,{$Hlo-$Hhi}	@ load twisted H
	vmov.i64	$k32,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	$IN,$IN
#endif
	vmov.i64	$k16,#0x000000000000ffff
	veor		$Hhl,$Hlo,$Hhi		@ Karatsuba pre-processing
	mov		$len,#16
	b		.Lgmult_neon
.size	GFp_gcm_gmult_neon,.-GFp_gcm_gmult_neon

.global	GFp_gcm_ghash_neon
.type	GFp_gcm_ghash_neon,%function
.align	4
GFp_gcm_ghash_neon:
	vld1.64		$Xl#hi,[$Xi]!		@ load Xi
	vld1.64		$Xl#lo,[$Xi]!
	vmov.i64	$k48,#0x0000ffffffffffff
	vldmia		$Htbl,{$Hlo-$Hhi}	@ load twisted H
	vmov.i64	$k32,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	$Xl,$Xl
#endif
	vmov.i64	$k16,#0x000000000000ffff
	veor		$Hhl,$Hlo,$Hhi		@ Karatsuba pre-processing

.Loop_neon:
	vld1.64		$IN#hi,[$inp]!		@ load inp
	vld1.64		$IN#lo,[$inp]!
#ifdef __ARMEL__
	vrev64.8	$IN,$IN
#endif
	veor		$IN,$Xl			@ inp^=Xi
.Lgmult_neon:
___
	&clmul64x64	($Xl,$Hlo,"$IN#lo");	# H.lo·Xi.lo
$code.=<<___;
	veor		$IN#lo,$IN#lo,$IN#hi	@ Karatsuba pre-processing
___
	&clmul64x64	($Xm,$Hhl,"$IN#lo");	# (H.lo+H.hi)·(Xi.lo+Xi.hi)
	&clmul64x64	($Xh,$Hhi,"$IN#hi");	# H.hi·Xi.hi
$code.=<<___;
	veor		$Xm,$Xm,$Xl		@ Karatsuba post-processing
	veor		$Xm,$Xm,$Xh
	veor		$Xl#hi,$Xl#hi,$Xm#lo
	veor		$Xh#lo,$Xh#lo,$Xm#hi	@ Xh|Xl - 256-bit result

	@ equivalent of reduction_avx from ghash-x86_64.pl
	vshl.i64	$t1,$Xl,#57		@ 1st phase
	vshl.i64	$t2,$Xl,#62
	veor		$t2,$t2,$t1		@
	vshl.i64	$t1,$Xl,#63
	veor		$t2, $t2, $t1		@
	veor		$Xl#hi,$Xl#hi,$t2#lo	@
	veor		$Xh#lo,$Xh#lo,$t2#hi

	vshr.u64	$t2,$Xl,#1		@ 2nd phase
	veor		$Xh,$Xh,$Xl
	veor		$Xl,$Xl,$t2		@
	vshr.u64	$t2,$t2,#6
	vshr.u64	$Xl,$Xl,#1		@
	veor		$Xl,$Xl,$Xh		@
	veor		$Xl,$Xl,$t2		@

	subs		$len,#16
	bne		.Loop_neon

#ifdef __ARMEL__
	vrev64.8	$Xl,$Xl
#endif
	sub		$Xi,#16
	vst1.64		$Xl#hi,[$Xi]!		@ write out Xi
	vst1.64		$Xl#lo,[$Xi]

	ret					@ bx lr
.size	GFp_gcm_ghash_neon,.-GFp_gcm_ghash_neon
#endif
___
}
$code.=<<___;
.asciz	"GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
	s/\bret\b/bx lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT"; # enforce flush