// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text

.globl	_gcm_init_v8
.private_extern	_gcm_init_v8

.align	4
_gcm_init_v8:
	AARCH64_VALID_CALL_TARGET
	ld1	{v17.2d},[x1]			//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57		//0xc2.0
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8	//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
	eor	v20.16b,v3.16b,v16.16b		//twisted H
	st1	{v20.2d},[x0],#16		//store Htable[0]

	//calculate H^2
	ext	v16.16b,v20.16b,v20.16b,#8	//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b

	ext	v17.16b,v22.16b,v22.16b,#8	//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8	//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0]		//store Htable[1..2]

	ret

.globl	_gcm_gmult_v8
.private_extern	_gcm_gmult_v8

.align	4
_gcm_gmult_v8:
	AARCH64_VALID_CALL_TARGET
	ld1	{v17.2d},[x0]			//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]		//load twisted H, ...
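	// The block below computes Xi = Xi·H in GF(2^128), split Karatsuba-
	// style into three 64x64-bit carry-less multiplies: lo·lo, hi·hi and
	// (lo^hi)·(lo^hi); v21 was preloaded with H.lo^H.hi by gcm_init_v8.
	// A rough sketch of the same step with ACLE intrinsics, illustrative
	// only (variable names are ours, not part of the generated file):
	//
	//	poly128_t lo  = vmull_p64(h_lo, xi_lo);	// low halves
	//	poly128_t hi  = vmull_high_p64(h, xi);	// high halves
	//	poly128_t mid = vmull_p64(h_lo ^ h_hi, xi_lo ^ xi_hi);
	//	// mid ^ lo ^ hi then recovers the middle partial product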
	shl	v19.2d,v19.2d,#57
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]			//write out Xi

	ret

.globl	_gcm_ghash_v8
.private_extern	_gcm_ghash_v8

.align	4
_gcm_ghash_v8:
	AARCH64_VALID_CALL_TARGET
	ld1	{v0.2d},[x0]			//load [rotated] Xi
						//"[rotated]" means the loaded
						//value must be rotated to match
						//the algorithm specification
	subs	x3,x3,#32			//see if x3 is 32 or larger
	mov	x12,#16				//x12 is used as the post-
						//increment for the input pointer;
						//as the loop is modulo-scheduled,
						//x12 is zeroed just in time
						//to preclude overstepping
						//inp[len], which means the
						//last block[s] are actually
						//loaded twice, but the last
						//copy is not processed
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
	ld1	{v16.2d},[x2],#16		//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8	//rotate I[0]
	b.lo	Lodd_tail_v8			//x3 was less than 32
	ld1	{v17.2d},[x2],x12		//load [rotated] I[1]
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	b	Loop_mod2x_v8

.align	4
Loop_mod2x_v8:
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32			//is there more data?
	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo			//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b		//accumulate
	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12		//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq			//is it time to zero x12?
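	// What follows folds the 256-bit product (low half in v0, high half
	// in v2, Karatsuba middle term in v1) back to 128 bits. Because
	// GHASH reduces modulo x^128 + x^7 + x^2 + x + 1, the reduction runs
	// as two pmull passes against the 0xc2..01 constant held in v19;
	// interleaving those passes with the next block's loads and
	// multiplies is what hides the pmull latency in this
	// modulo-scheduled loop.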
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12		//load [rotated] I[i+3]
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	b.hs	Loop_mod2x_v8			//there were at least 32 more bytes

	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8	//re-construct v3.16b
	adds	x3,x3,#32			//re-construct x3
	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
	b.eq	Ldone_v8			//is x3 zero?
Lodd_tail_v8:
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

Ldone_v8:
#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]			//write out Xi

	ret

.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif  // !OPENSSL_NO_ASM
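// Note: the C prototypes these symbols back are declared in BoringSSL's
// crypto/fipsmodule/modes/internal.h. The sketch below is a paraphrase
// for reference and may drift from the actual header; check it there:
//
//	void gcm_init_v8(u128 Htable[16], const uint64_t H[2]);
//	void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[16]);
//	void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[16],
//	                  const uint8_t *inp, size_t len);
//
// gcm_ghash_v8 expects len (x3 above) to be a multiple of the 16-byte
// block size; callers handle any partial final block themselves.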