// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

// Syntax/target: GNU as, AArch64, assembled with the crypto extension
// enabled (see the .arch directive below). Comments use "//".
//
// Conventions shared by every routine in this file:
//   - v19 holds the reduction constant, composed in place with
//     movi #0xe1 followed by shl #57 (the "0xc2.0" constant).
//   - On little-endian builds (__ARMEB__ not defined) data is byte-reversed
//     within each 64-bit lane (rev64) after loading and before storing.
//   - Only v0-v7 and v16-v31 are used as vector registers; v8-v15 are
//     never touched.
//   - AARCH64_VALID_CALL_TARGET is not defined in this file; presumably it
//     comes from <openssl/arm_arch.h> -- confirm there.
//   - NOTE(review): the C prototypes are not visible here; confirm the
//     argument types against the declaring header.

#if __ARM_MAX_ARCH__>=7
.text
.arch	armv8-a+crypto

//----------------------------------------------------------------------
// gcm_init_v8
//
// Expands the 16-byte hash key H into a six-entry (96-byte) table. The
// layout matches what the other routines in this file load through x1.
//
// In:    x0 = destination table (six 16-byte entries are stored)
//        x1 = source H (16 bytes are loaded)
// Out:   table at the incoming x0, in store order:
//          [0] twisted H
//          [1] packed Karatsuba terms: lo^hi of H (low half) and
//              lo^hi of H^2 (high half)
//          [2] H^2
//          [3] H^3
//          [4] packed Karatsuba terms: lo^hi of H^3 (low half) and
//              lo^hi of H^4 (high half)
//          [5] H^4
// Clobb: x0 (advanced by 48), v0-v7, v16-v22
//----------------------------------------------------------------------
.globl	gcm_init_v8
.hidden	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
	AARCH64_VALID_CALL_TARGET
	ld1	{v17.2d},[x1]		//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57		//0xc2.0
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
	eor	v20.16b,v3.16b,v16.16b		//twisted H
	st1	{v20.2d},[x0],#16		//store Htable[0]

	//calculate H^2
	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b		//v22 = H^2

	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
	//calculate H^3 and H^4
	//(two independent multiply/reduce chains are interleaved below:
	// v0/v1/v2 carry H*H^2, v5/v6/v7 carry H^2*H^2)
	pmull	v0.1q,v20.1d, v22.1d
	pmull	v5.1q,v22.1d,v22.1d
	pmull2	v2.1q,v20.2d, v22.2d
	pmull2	v7.1q,v22.2d,v22.2d
	pmull	v1.1q,v16.1d,v17.1d
	pmull	v6.1q,v17.1d,v17.1d

	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	ext	v17.16b,v5.16b,v7.16b,#8
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v16.16b
	eor	v4.16b,v5.16b,v7.16b
	eor	v6.16b,v6.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase
	eor	v6.16b,v6.16b,v4.16b
	pmull	v4.1q,v5.1d,v19.1d

	ins	v2.d[0],v1.d[1]
	ins	v7.d[0],v6.d[1]
	ins	v1.d[1],v0.d[0]
	ins	v6.d[1],v5.d[0]
	eor	v0.16b,v1.16b,v18.16b
	eor	v5.16b,v6.16b,v4.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
	ext	v4.16b,v5.16b,v5.16b,#8
	pmull	v0.1q,v0.1d,v19.1d
	pmull	v5.1q,v5.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v4.16b,v4.16b,v7.16b
	eor	v20.16b, v0.16b,v18.16b		//H^3
	eor	v22.16b,v5.16b,v4.16b		//H^4

	ext	v16.16b,v20.16b, v20.16b,#8		//Karatsuba pre-processing
	ext	v17.16b,v22.16b,v22.16b,#8
	eor	v16.16b,v16.16b,v20.16b
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
	st1	{v20.2d,v21.2d,v22.2d},[x0]		//store Htable[3..5]
	ret
.size	gcm_init_v8,.-gcm_init_v8

//----------------------------------------------------------------------
// gcm_gmult_v8
//
// Multiplies the 16-byte value at x0 by H (one Karatsuba multiply plus
// the two-phase reduction) and writes the result back to the same
// location.
//
// In:    x0 = Xi (16 bytes, read and then overwritten)
//        x1 = key table; only the first two entries (32 bytes) are loaded
// Clobb: v0-v3, v17-v21
//----------------------------------------------------------------------
.globl	gcm_gmult_v8
.hidden	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
	AARCH64_VALID_CALL_TARGET
	ld1	{v17.2d},[x0]		//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
	shl	v19.2d,v19.2d,#57
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_gmult_v8,.-gcm_gmult_v8

//----------------------------------------------------------------------
// gcm_ghash_v8
//
// Folds a run of 16-byte input blocks into Xi. Lengths of 64 bytes or
// more branch to the 4x path (.Lgcm_ghash_v8_4x, further down); shorter
// inputs are handled here by a two-blocks-per-iteration loop
// (.Loop_mod2x_v8) followed by a single-block tail (.Lodd_tail_v8).
//
// In:    x0 = Xi (16 bytes, read and then overwritten)
//        x1 = key table (the first three entries are loaded on this path)
//        x2 = input pointer
//        x3 = input length in bytes
//        NOTE(review): the code looks written for a length that is a
//        nonzero multiple of 16 -- confirm against the callers.
// Clobb: x1, x2, x3, x12, flags, v0-v7, v16-v22
//----------------------------------------------------------------------
.globl	gcm_ghash_v8
.hidden	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
	AARCH64_VALID_CALL_TARGET
	cmp	x3,#64			//64 bytes or more?
	b.hs	.Lgcm_ghash_v8_4x	//yes, take the 4x path
	ld1	{v0.2d},[x0]		//load [rotated] Xi
						//"[rotated]" means that
						//loaded value would have
						//to be rotated in order to
						//make it appear as in
						//algorithm specification
	subs	x3,x3,#32		//see if x3 is 32 or larger
	mov	x12,#16		//x12 is used as post-
						//increment for input pointer;
						//as loop is modulo-scheduled
						//x12 is zeroed just in time
						//to preclude overstepping
						//inp[len], which means that
						//last block[s] are actually
						//loaded twice, but last
						//copy is not processed
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
	b.lo	.Lodd_tail_v8		//x3 was less than 32
	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	b	.Loop_mod2x_v8

.align	4
.Loop_mod2x_v8:
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32		//is there more data?
	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo			//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b		//accumulate
	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	b.hs	.Loop_mod2x_v8		//there was at least 32 more bytes
					//(flags are still those of the subs
					//at the top of the loop)

	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
	adds	x3,x3,#32		//re-construct x3
	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
	b.eq	.Ldone_v8		//is x3 zero?
.Lodd_tail_v8:
	//single remaining block: one multiply by H plus reduction
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

.Ldone_v8:
#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_ghash_v8,.-gcm_ghash_v8

//----------------------------------------------------------------------
// gcm_ghash_v8_4x
//
// Four-blocks-per-iteration path of gcm_ghash_v8. There is no .globl for
// this symbol; it is reached through the b.hs at the top of gcm_ghash_v8,
// so it sees the same x0-x3 as that function, with x3 >= 64.
//
// Structure:
//   prologue  load the first 64 bytes and form the partial products of
//             blocks 1..3 with H^3, H^2 and H (kept in v29/v30/v31)
//   .Loop4x   entered only when a further full 64-byte group exists;
//             each pass finishes the previous group with H^4, reduces,
//             and starts the partial products of the group just loaded
//   .Ltail4x  finishes the last full group with H^4, then dispatches on
//             the bytes still left (none / .Lone / .Ltwo / .Lthree)
//   .Ldone4x  final Karatsuba post-processing, reduction and store
//
// Clobb: x1, x2, x3, flags, v0-v7, v16-v31
//----------------------------------------------------------------------
.type	gcm_ghash_v8_4x,%function
.align	4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
	ld1	{v0.2d},[x0]		//load [rotated] Xi
	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant

	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64	//load first four blocks
#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v7.16b,v7.16b
	rev64	v4.16b,v4.16b
#endif
	ext	v25.16b,v7.16b,v7.16b,#8
	ext	v24.16b,v6.16b,v6.16b,#8
	ext	v23.16b,v5.16b,v5.16b,#8

	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
	eor	v7.16b,v7.16b,v25.16b
	pmull2	v31.1q,v20.2d,v25.2d
	pmull	v30.1q,v21.1d,v7.1d

	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
	eor	v6.16b,v6.16b,v24.16b
	pmull2	v24.1q,v22.2d,v24.2d
	pmull2	v6.1q,v21.2d,v6.2d

	eor	v29.16b,v29.16b,v16.16b
	eor	v31.16b,v31.16b,v24.16b
	eor	v30.16b,v30.16b,v6.16b

	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	pmull2	v23.1q,v26.2d,v23.2d
	pmull	v5.1q,v27.1d,v5.1d

	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	eor	v30.16b,v30.16b,v5.16b

	subs	x3,x3,#128		//is a second 64-byte group available?
	b.lo	.Ltail4x		//no, finish the group already loaded

	b	.Loop4x

.align	4
.Loop4x:
	eor	v16.16b,v4.16b,v0.16b
	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64	//load next four blocks
	ext	v3.16b,v16.16b,v16.16b,#8
#ifndef __ARMEB__
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v7.16b,v7.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v28.2d,v3.2d
	ext	v25.16b,v7.16b,v7.16b,#8
	pmull2	v1.1q,v27.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	ext	v24.16b,v6.16b,v6.16b,#8
	eor	v1.16b,v1.16b,v30.16b
	ext	v23.16b,v5.16b,v5.16b,#8

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
	eor	v7.16b,v7.16b,v25.16b
	eor	v1.16b,v1.16b,v17.16b
	pmull2	v31.1q,v20.2d,v25.2d
	eor	v1.16b,v1.16b,v18.16b
	pmull	v30.1q,v21.1d,v7.1d

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
	eor	v6.16b,v6.16b,v24.16b
	pmull2	v24.1q,v22.2d,v24.2d
	eor	v0.16b,v1.16b,v18.16b
	pmull2	v6.1q,v21.2d,v6.2d

	eor	v29.16b,v29.16b,v16.16b
	eor	v31.16b,v31.16b,v24.16b
	eor	v30.16b,v30.16b,v6.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	eor	v18.16b,v18.16b,v2.16b
	pmull2	v23.1q,v26.2d,v23.2d
	pmull	v5.1q,v27.1d,v5.1d

	eor	v0.16b,v0.16b,v18.16b
	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	ext	v0.16b,v0.16b,v0.16b,#8
	eor	v30.16b,v30.16b,v5.16b

	subs	x3,x3,#64		//another full 64-byte group to load?
	b.hs	.Loop4x

.Ltail4x:
	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v28.2d,v3.2d
	pmull2	v1.1q,v27.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b

	adds	x3,x3,#64		//x3 = bytes left after the last full group
	b.eq	.Ldone4x		//nothing left

	cmp	x3,#32
	b.lo	.Lone			//fewer than 32 bytes left: one block is loaded
	b.eq	.Ltwo			//exactly 32 bytes left: two blocks are loaded
.Lthree:
	//more than 32 bytes left: three blocks are loaded
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d,v5.2d,v6.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v24.16b,v6.16b,v6.16b,#8
	ext	v23.16b,v5.16b,v5.16b,#8
	eor	v0.16b,v1.16b,v18.16b

	pmull	v29.1q,v20.1d,v24.1d		//H·Ii+2
	eor	v6.16b,v6.16b,v24.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	pmull2	v31.1q,v20.2d,v24.2d
	pmull	v30.1q,v21.1d,v6.1d
	eor	v0.16b,v0.16b,v18.16b
	pmull	v7.1q,v22.1d,v23.1d		//H^2·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	pmull2	v23.1q,v22.2d,v23.2d
	eor	v16.16b,v4.16b,v0.16b
	pmull2	v5.1q,v21.2d,v5.2d
	ext	v3.16b,v16.16b,v16.16b,#8

	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	eor	v30.16b,v30.16b,v5.16b

	pmull	v0.1q,v26.1d,v3.1d		//H^3·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v26.2d,v3.2d
	pmull	v1.1q,v27.1d,v16.1d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b
	b	.Ldone4x

.align	4
.Ltwo:
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d,v5.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
	rev64	v5.16b,v5.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v23.16b,v5.16b,v5.16b,#8
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	pmull	v29.1q,v20.1d,v23.1d		//H·Ii+1
	eor	v5.16b,v5.16b,v23.16b

	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull2	v31.1q,v20.2d,v23.2d
	pmull	v30.1q,v21.1d,v5.1d

	pmull	v0.1q,v22.1d,v3.1d		//H^2·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v22.2d,v3.2d
	pmull2	v1.1q,v21.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b
	b	.Ldone4x

.align	4
.Lone:
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull	v0.1q,v20.1d,v3.1d		//multiply (Xi+Ii) by the first table entry (twisted H)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v20.2d,v3.2d
	pmull	v1.1q,v21.1d,v16.1d

.Ldone4x:
	//v0/v1/v2 hold an unreduced Karatsuba product on every path here
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
// NUL-terminated ASCII: "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif
#endif
#endif  // !OPENSSL_NO_ASM
.section	.note.GNU-stack,"",%progbits