// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

// Overview (review notes; all code below is unchanged):
//   GHASH for AArch64 using the polynomial multiply instructions pmull/pmull2.
//   Three global entry points are defined:
//     gcm_init_v8  - expand the hash key H into a six-entry table
//     gcm_gmult_v8 - multiply one 16-byte Xi by H in place
//     gcm_ghash_v8 - fold a buffer of 16-byte blocks into Xi
//   Syntax is GNU as for AArch64, run through the C preprocessor. Comments
//   use "//". The .def/.type 32/.endef triplets are COFF symbol records that
//   mark each symbol as a function.
//   Only v0-v7 and v16-v31 are written by this file; v8-v15 are never touched.
//   No stack is used and no calls are made.
//   AARCH64_VALID_CALL_TARGET comes from <openssl/arm_arch.h>.
//   NOTE(review): presumably it expands to a BTI landing pad - confirm.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

#if __ARM_MAX_ARCH__>=7
.text
.arch	armv8-a+crypto

//----------------------------------------------------------------------
// gcm_init_v8
//   In:   x0 = Htable, output area for six 16-byte entries
//         x1 = pointer to the 16-byte input H
//   Out:  Htable[0] = twisted H
//         Htable[1] = packed Karatsuba terms (lo^hi) of H and H^2
//         Htable[2] = H^2
//         Htable[3] = H^3
//         Htable[4] = packed Karatsuba terms (lo^hi) of H^3 and H^4
//         Htable[5] = H^4
//   Clobbers: x0 (advanced by 48), v0-v7, v16-v22
//----------------------------------------------------------------------
.globl	gcm_init_v8

.def gcm_init_v8
   .type 32
.endef
.align	4
gcm_init_v8:
	AARCH64_VALID_CALL_TARGET
	ld1	{v17.2d},[x1]		//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57		//0xc2.0
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8	//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b	//H<<<=1
	eor	v20.16b,v3.16b,v16.16b	//twisted H
	st1	{v20.2d},[x0],#16		//store Htable[0]

	//calculate H^2
	ext	v16.16b,v20.16b,v20.16b,#8	//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b		//v22 = H^2

	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
	//calculate H^3 and H^4
	//two independent multiply+reduce chains are interleaved below:
	//  v0/v1/v2 (+v18) form H*H^2, v5/v6/v7 (+v4) form H^2*H^2
	pmull	v0.1q,v20.1d, v22.1d
	pmull	v5.1q,v22.1d,v22.1d
	pmull2	v2.1q,v20.2d, v22.2d
	pmull2	v7.1q,v22.2d,v22.2d
	pmull	v1.1q,v16.1d,v17.1d
	pmull	v6.1q,v17.1d,v17.1d

	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	ext	v17.16b,v5.16b,v7.16b,#8
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v16.16b
	eor	v4.16b,v5.16b,v7.16b
	eor	v6.16b,v6.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase
	eor	v6.16b,v6.16b,v4.16b
	pmull	v4.1q,v5.1d,v19.1d

	ins	v2.d[0],v1.d[1]
	ins	v7.d[0],v6.d[1]
	ins	v1.d[1],v0.d[0]
	ins	v6.d[1],v5.d[0]
	eor	v0.16b,v1.16b,v18.16b
	eor	v5.16b,v6.16b,v4.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
	ext	v4.16b,v5.16b,v5.16b,#8
	pmull	v0.1q,v0.1d,v19.1d
	pmull	v5.1q,v5.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v4.16b,v4.16b,v7.16b
	eor	v20.16b, v0.16b,v18.16b		//H^3
	eor	v22.16b,v5.16b,v4.16b		//H^4

	ext	v16.16b,v20.16b, v20.16b,#8		//Karatsuba pre-processing
	ext	v17.16b,v22.16b,v22.16b,#8
	eor	v16.16b,v16.16b,v20.16b
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
	st1	{v20.2d,v21.2d,v22.2d},[x0]		//store Htable[3..5]
	ret

//----------------------------------------------------------------------
// gcm_gmult_v8
//   In:   x0 = Xi, 16 bytes, read and then overwritten with the product
//         x1 = Htable; only entries 0 and 1 are read
//   Out:  Xi = Xi * H, written back through x0
//   Clobbers: v0-v3, v17-v21
//----------------------------------------------------------------------
.globl	gcm_gmult_v8

.def gcm_gmult_v8
   .type 32
.endef
.align	4
gcm_gmult_v8:
	AARCH64_VALID_CALL_TARGET
	ld1	{v17.2d},[x0]		//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
	shl	v19.2d,v19.2d,#57		//v19 = 0xc2.0 reduction constant
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b		//little-endian build: byte-reverse each half
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret

//----------------------------------------------------------------------
// gcm_ghash_v8
//   In:   x0 = Xi, 16 bytes, read and then overwritten with the result
//         x1 = Htable as written by gcm_init_v8
//         x2 = input pointer
//         x3 = input length in bytes
//   Out:  Xi updated through x0
//   Flow: x3 >= 64 branches to Lgcm_ghash_v8_4x (four blocks per iteration).
//         Otherwise Loop_mod2x_v8 handles two blocks per iteration with
//         H and H^2, and Lodd_tail_v8 handles one trailing block.
//   Clobbers: x1, x2, x3, x12, flags, v0-v7, v16-v22 on this path
//   NOTE(review): the first 16-byte block is loaded unconditionally and
//   the tails load whole 16-byte blocks, so callers presumably pass a
//   non-zero multiple of 16 in x3 - confirm against the C caller.
//----------------------------------------------------------------------
.globl	gcm_ghash_v8

.def gcm_ghash_v8
   .type 32
.endef
.align	4
gcm_ghash_v8:
	AARCH64_VALID_CALL_TARGET
	cmp	x3,#64
	b.hs	Lgcm_ghash_v8_4x		//64 bytes or more: use the 4x path
	ld1	{v0.2d},[x0]		//load [rotated] Xi
						//"[rotated]" means that
						//loaded value would have
						//to be rotated in order to
						//make it appear as in
						//algorithm specification
	subs	x3,x3,#32		//see if x3 is 32 or larger
	mov	x12,#16		//x12 is used as post-
						//increment for input pointer;
						//as loop is modulo-scheduled
						//x12 is zeroed just in time
						//to preclude overstepping
						//inp[len], which means that
						//last block[s] are actually
						//loaded twice, but last
						//copy is not processed
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
	b.lo	Lodd_tail_v8		//x3 was less than 32
	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	b	Loop_mod2x_v8		//branch past any .align padding

.align	4
Loop_mod2x_v8:
	//flags set by the subs below stay live until the b.hs at the loop
	//bottom; the instructions in between here do not set flags
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32		//is there more data?
	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo			//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b		//accumulate
	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	b.hs	Loop_mod2x_v8		//there was at least 32 more bytes

	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
	adds	x3,x3,#32		//re-construct x3
	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
	b.eq	Ldone_v8		//is x3 zero?
Lodd_tail_v8:
	//one remaining block: Xi = (Xi ^ inp) * H
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

Ldone_v8:
#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret

//----------------------------------------------------------------------
// gcm_ghash_v8_4x (not .globl; entered by the b.hs in gcm_ghash_v8)
//   In:   x0 = Xi, x1 = Htable, x2 = input pointer, x3 = length in bytes,
//         with x3 >= 64 on entry from gcm_ghash_v8
//   Flow: Loop4x consumes 64 bytes per iteration using H, H^2, H^3, H^4.
//         v29/v30/v31 carry the unreduced lo/mid/hi partial products of
//         blocks 1..3 into the next step. Block 0, which is xored with Xi,
//         is multiplied by H^4 one step later.
//         Ltail4x finishes the pending batch and then dispatches on the
//         bytes left: 0 -> Ldone4x, below 32 -> Lone, 32 -> Ltwo,
//         otherwise Lthree.
//   Clobbers: x1, x2, x3, flags, v0-v7, v16-v31
//----------------------------------------------------------------------
.def gcm_ghash_v8_4x
   .type 32
.endef
.align	4
gcm_ghash_v8_4x:
Lgcm_ghash_v8_4x:
	ld1	{v0.2d},[x0]		//load [rotated] Xi
	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant

	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64	//load first four input blocks
#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v7.16b,v7.16b
	rev64	v4.16b,v4.16b
#endif
	ext	v25.16b,v7.16b,v7.16b,#8
	ext	v24.16b,v6.16b,v6.16b,#8
	ext	v23.16b,v5.16b,v5.16b,#8

	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
	eor	v7.16b,v7.16b,v25.16b
	pmull2	v31.1q,v20.2d,v25.2d
	pmull	v30.1q,v21.1d,v7.1d

	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
	eor	v6.16b,v6.16b,v24.16b
	pmull2	v24.1q,v22.2d,v24.2d
	pmull2	v6.1q,v21.2d,v6.2d

	eor	v29.16b,v29.16b,v16.16b
	eor	v31.16b,v31.16b,v24.16b
	eor	v30.16b,v30.16b,v6.16b

	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	pmull2	v23.1q,v26.2d,v23.2d
	pmull	v5.1q,v27.1d,v5.1d

	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	eor	v30.16b,v30.16b,v5.16b

	subs	x3,x3,#128		//is there a second full 64-byte batch?
	b.lo	Ltail4x

	b	Loop4x		//branch past any .align padding

.align	4
Loop4x:
	eor	v16.16b,v4.16b,v0.16b		//block 0 of pending batch ^ Xi
	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64	//load next four blocks
	ext	v3.16b,v16.16b,v16.16b,#8
#ifndef __ARMEB__
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v7.16b,v7.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v28.2d,v3.2d
	ext	v25.16b,v7.16b,v7.16b,#8
	pmull2	v1.1q,v27.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	ext	v24.16b,v6.16b,v6.16b,#8
	eor	v1.16b,v1.16b,v30.16b
	ext	v23.16b,v5.16b,v5.16b,#8

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
	eor	v7.16b,v7.16b,v25.16b
	eor	v1.16b,v1.16b,v17.16b
	pmull2	v31.1q,v20.2d,v25.2d
	eor	v1.16b,v1.16b,v18.16b
	pmull	v30.1q,v21.1d,v7.1d

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
	eor	v6.16b,v6.16b,v24.16b
	pmull2	v24.1q,v22.2d,v24.2d
	eor	v0.16b,v1.16b,v18.16b
	pmull2	v6.1q,v21.2d,v6.2d

	eor	v29.16b,v29.16b,v16.16b
	eor	v31.16b,v31.16b,v24.16b
	eor	v30.16b,v30.16b,v6.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	eor	v18.16b,v18.16b,v2.16b
	pmull2	v23.1q,v26.2d,v23.2d
	pmull	v5.1q,v27.1d,v5.1d

	eor	v0.16b,v0.16b,v18.16b
	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	ext	v0.16b,v0.16b,v0.16b,#8
	eor	v30.16b,v30.16b,v5.16b

	subs	x3,x3,#64		//another full 64-byte batch available?
	b.hs	Loop4x

Ltail4x:
	//finish the pending batch: multiply (block 0 ^ Xi) by H^4 and add the
	//carried partial products; reduction happens in the branch taken below
	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v28.2d,v3.2d
	pmull2	v1.1q,v27.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b

	adds	x3,x3,#64		//x3 = bytes left after the pending batch
	b.eq	Ldone4x

	cmp	x3,#32
	b.lo	Lone		//fewer than 32 bytes left: one block
	b.eq	Ltwo		//exactly 32 bytes left: two blocks
Lthree:
	//three blocks left: reduce the pending product, then multiply the
	//blocks by H^3, H^2 and H respectively
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d,v5.2d,v6.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef	__ARMEB__
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v24.16b,v6.16b,v6.16b,#8
	ext	v23.16b,v5.16b,v5.16b,#8
	eor	v0.16b,v1.16b,v18.16b

	pmull	v29.1q,v20.1d,v24.1d		//H·Ii+2
	eor	v6.16b,v6.16b,v24.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	pmull2	v31.1q,v20.2d,v24.2d
	pmull	v30.1q,v21.1d,v6.1d
	eor	v0.16b,v0.16b,v18.16b
	pmull	v7.1q,v22.1d,v23.1d		//H^2·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	pmull2	v23.1q,v22.2d,v23.2d
	eor	v16.16b,v4.16b,v0.16b
	pmull2	v5.1q,v21.2d,v5.2d
	ext	v3.16b,v16.16b,v16.16b,#8

	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	eor	v30.16b,v30.16b,v5.16b

	pmull	v0.1q,v26.1d,v3.1d		//H^3·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v26.2d,v3.2d
	pmull	v1.1q,v27.1d,v16.1d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b
	b	Ldone4x

.align	4
Ltwo:
	//two blocks left: reduce the pending product, then multiply the
	//blocks by H^2 and H respectively
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d,v5.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef	__ARMEB__
	rev64	v5.16b,v5.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v23.16b,v5.16b,v5.16b,#8
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	pmull	v29.1q,v20.1d,v23.1d		//H·Ii+1
	eor	v5.16b,v5.16b,v23.16b

	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull2	v31.1q,v20.2d,v23.2d
	pmull	v30.1q,v21.1d,v5.1d

	pmull	v0.1q,v22.1d,v3.1d		//H^2·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v22.2d,v3.2d
	pmull2	v1.1q,v21.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b
	b	Ldone4x

.align	4
Lone:
	//one block left: reduce the pending product, then multiply
	//(block ^ Xi) by H; falls through into Ldone4x for the reduction
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef	__ARMEB__
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull	v0.1q,v20.1d,v3.1d
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v20.2d,v3.2d
	pmull	v1.1q,v21.1d,v16.1d

Ldone4x:
	//final reduction of the unreduced product held in v0/v1/v2
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	st1	{v0.2d},[x0]		//write out Xi

	ret

// NUL-terminated ASCII identification string:
// "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif
#endif
#endif  // !OPENSSL_NO_ASM