// GHASH (the GF(2^128) hash used by GCM) for AArch64, built on the
// PMULL/PMULL2 polynomial-multiply instructions.
//
// Syntax: GNU-style AArch64 assembly, run through the C preprocessor.
// Exported symbols carry a leading underscore; labels starting with "L"
// are local branch targets.
//
// Exported entry points:
//   _gcm_init_v8   x0 = Htable (written), x1 = H (read)
//   _gcm_gmult_v8  x0 = Xi (read and written), x1 = Htable (read)
//   _gcm_ghash_v8  x0 = Xi (read and written), x1 = Htable (read),
//                  x2 = input (read), x3 = input length in bytes
//
// Htable layout as written by _gcm_init_v8 (16 bytes per entry):
//   [0] twisted H
//   [1] packed Karatsuba terms: (H.lo^H.hi) and (H^2.lo^H^2.hi)
//   [2] H^2
//   [3] H^3
//   [4] packed Karatsuba terms: (H^3.lo^H^3.hi) and (H^4.lo^H^4.hi)
//   [5] H^4
//
// None of the routines touch the stack or v8-v15. No general-purpose
// register other than x0-x3 and x12 is written.
//
// NOTE(review): the C prototypes are not visible in this file; confirm
// the argument types against the C declarations.
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7

.text
.globl	_gcm_init_v8

.align	4
//----------------------------------------------------------------------
// _gcm_init_v8
//   In:    x0 = Htable (48 + 48 bytes are stored), x1 = H (16 bytes)
//   Out:   Htable[0..5] filled in as described at the top of the file
//   Clobb: x0 (advanced by 48), v0-v7, v16-v22
//----------------------------------------------------------------------
_gcm_init_v8:
	ld1	{v17.2d},[x1]		//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57		//0xc2.0
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
	eor	v20.16b,v3.16b,v16.16b		//twisted H
	st1	{v20.2d},[x0],#16		//store Htable[0]

	//calculate H^2
	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b		//v22 = H^2

	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
	//calculate H^3 and H^4
	//two multiplications are interleaved: H·H^2 in v0-v2 and
	//H^2·H^2 in v5-v7
	pmull	v0.1q,v20.1d, v22.1d
	pmull	v5.1q,v22.1d,v22.1d
	pmull2	v2.1q,v20.2d, v22.2d
	pmull2	v7.1q,v22.2d,v22.2d
	pmull	v1.1q,v16.1d,v17.1d
	pmull	v6.1q,v17.1d,v17.1d

	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	ext	v17.16b,v5.16b,v7.16b,#8
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v16.16b
	eor	v4.16b,v5.16b,v7.16b
	eor	v6.16b,v6.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase
	eor	v6.16b,v6.16b,v4.16b
	pmull	v4.1q,v5.1d,v19.1d

	ins	v2.d[0],v1.d[1]
	ins	v7.d[0],v6.d[1]
	ins	v1.d[1],v0.d[0]
	ins	v6.d[1],v5.d[0]
	eor	v0.16b,v1.16b,v18.16b
	eor	v5.16b,v6.16b,v4.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
	ext	v4.16b,v5.16b,v5.16b,#8
	pmull	v0.1q,v0.1d,v19.1d
	pmull	v5.1q,v5.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v4.16b,v4.16b,v7.16b
	eor	v20.16b, v0.16b,v18.16b		//H^3
	eor	v22.16b,v5.16b,v4.16b		//H^4

	ext	v16.16b,v20.16b, v20.16b,#8		//Karatsuba pre-processing
	ext	v17.16b,v22.16b,v22.16b,#8
	eor	v16.16b,v16.16b,v20.16b
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
	st1	{v20.2d,v21.2d,v22.2d},[x0]		//store Htable[3..5]
	ret

.globl	_gcm_gmult_v8

.align	4
//----------------------------------------------------------------------
// _gcm_gmult_v8
//   In:    x0 = Xi (16 bytes), x1 = Htable (first 32 bytes are read)
//   Out:   Xi replaced in place by the reduced product of Xi and H
//   Clobb: v0-v3, v17-v21
//   On builds without __AARCH64EB__ the bytes of each 64-bit half of
//   Xi are reversed after the load and again before the store.
//----------------------------------------------------------------------
_gcm_gmult_v8:
	ld1	{v17.2d},[x0]		//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
#ifndef __AARCH64EB__
	rev64	v17.16b,v17.16b
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret

.globl	_gcm_ghash_v8

.align	4
//----------------------------------------------------------------------
// _gcm_ghash_v8
//   In:    x0 = Xi (16 bytes), x1 = Htable, x2 = input, x3 = length
//   Out:   Xi updated in place with the hash of the input folded in
//   Clobb: x1, x2, x3, x12, flags, v0-v7, v16-v22
//          (the 4x path additionally writes v23-v31)
//   Lengths of 64 bytes or more branch to the 4-blocks-per-iteration
//   code at Lgcm_ghash_v8_4x; shorter inputs use the 2-blocks-per-
//   iteration loop below, with Lodd_tail_v8 handling a single block.
//   NOTE(review): no remainder handling is visible, so the length is
//   presumably a non-zero multiple of 16 -- confirm against callers.
//----------------------------------------------------------------------
_gcm_ghash_v8:
	cmp	x3,#64
	b.hs	Lgcm_ghash_v8_4x
	ld1	{v0.2d},[x0]		//load [rotated] Xi
						//"[rotated]" means that the
						//loaded value has to be
						//rotated in order to make it
						//appear as in the algorithm
						//specification
	subs	x3,x3,#32		//see if x3 is 32 or larger
	mov	x12,#16		//x12 is used as post-
						//increment for input pointer;
						//as loop is modulo-scheduled
						//x12 is zeroed just in time
						//to preclude overstepping
						//inp[len], which means that
						//last block[s] are actually
						//loaded twice, but last
						//copy is not processed
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
#ifndef __AARCH64EB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
	b.lo	Lodd_tail_v8		//x3 was less than 32
	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
#ifndef __AARCH64EB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	b	Loop_mod2x_v8

.align	4
	//two blocks per iteration: (Xi^I[i])·H^2 is accumulated with
	//I[i+1]·H, then reduced while the next two blocks are loaded
Loop_mod2x_v8:
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32		//is there more data?
	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo			//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b		//accumulate
	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
#ifndef __AARCH64EB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

#ifndef __AARCH64EB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	b.hs	Loop_mod2x_v8		//there was at least 32 more bytes

	//loop exit: the flags tested below come from the adds, and the
	//values folded early into v3 are rebuilt for the tail
	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
	adds	x3,x3,#32		//re-construct x3
	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
	b.eq	Ldone_v8		//is x3 zero?
	//single remaining block: (Xi^inp)·H followed by a full reduction
Lodd_tail_v8:
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

Ldone_v8:
#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret


.align	4
//----------------------------------------------------------------------
// gcm_ghash_v8_4x / Lgcm_ghash_v8_4x
//   Not exported; reached from _gcm_ghash_v8 when x3 >= 64, with the
//   same register arguments (x0 = Xi, x1 = Htable, x2 = input,
//   x3 = length in bytes).
//   Register roles set up below:
//     v20-v22  Htable[0..2]     v26-v28  Htable[3..5]
//     v19      0xc2.0 reduction constant
//     v29-v31  running low / middle / high Karatsuba sums of the three
//              trailing blocks of the current group of four
//   Clobb: x1, x2, x3, flags, v0-v7, v16-v31
//----------------------------------------------------------------------
gcm_ghash_v8_4x:
Lgcm_ghash_v8_4x:
	ld1	{v0.2d},[x0]		//load [rotated] Xi
	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant

	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v7.16b,v7.16b
	rev64	v4.16b,v4.16b
#endif
	ext	v25.16b,v7.16b,v7.16b,#8
	ext	v24.16b,v6.16b,v6.16b,#8
	ext	v23.16b,v5.16b,v5.16b,#8

	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
	eor	v7.16b,v7.16b,v25.16b
	pmull2	v31.1q,v20.2d,v25.2d
	pmull	v30.1q,v21.1d,v7.1d

	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
	eor	v6.16b,v6.16b,v24.16b
	pmull2	v24.1q,v22.2d,v24.2d
	pmull2	v6.1q,v21.2d,v6.2d

	eor	v29.16b,v29.16b,v16.16b
	eor	v31.16b,v31.16b,v24.16b
	eor	v30.16b,v30.16b,v6.16b

	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	pmull2	v23.1q,v26.2d,v23.2d
	pmull	v5.1q,v27.1d,v5.1d

	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	eor	v30.16b,v30.16b,v5.16b

	subs	x3,x3,#128		//at least two groups of four?
	b.lo	Ltail4x

	b	Loop4x

.align	4
	//each iteration finishes the current group of four (H^4 term and
	//reduction) while loading and multiplying the three trailing
	//blocks of the next group
Loop4x:
	eor	v16.16b,v4.16b,v0.16b
	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
	ext	v3.16b,v16.16b,v16.16b,#8
#ifndef __AARCH64EB__
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v7.16b,v7.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v28.2d,v3.2d
	ext	v25.16b,v7.16b,v7.16b,#8
	pmull2	v1.1q,v27.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	ext	v24.16b,v6.16b,v6.16b,#8
	eor	v1.16b,v1.16b,v30.16b
	ext	v23.16b,v5.16b,v5.16b,#8

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
	eor	v7.16b,v7.16b,v25.16b
	eor	v1.16b,v1.16b,v17.16b
	pmull2	v31.1q,v20.2d,v25.2d
	eor	v1.16b,v1.16b,v18.16b
	pmull	v30.1q,v21.1d,v7.1d

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
	eor	v6.16b,v6.16b,v24.16b
	pmull2	v24.1q,v22.2d,v24.2d
	eor	v0.16b,v1.16b,v18.16b
	pmull2	v6.1q,v21.2d,v6.2d

	eor	v29.16b,v29.16b,v16.16b
	eor	v31.16b,v31.16b,v24.16b
	eor	v30.16b,v30.16b,v6.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	eor	v18.16b,v18.16b,v2.16b
	pmull2	v23.1q,v26.2d,v23.2d
	pmull	v5.1q,v27.1d,v5.1d

	eor	v0.16b,v0.16b,v18.16b
	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	ext	v0.16b,v0.16b,v0.16b,#8
	eor	v30.16b,v30.16b,v5.16b

	subs	x3,x3,#64
	b.hs	Loop4x

	//finish the last full group of four, leaving the unreduced
	//product in v0 (low), v1 (middle) and v2 (high)
Ltail4x:
	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v28.2d,v3.2d
	pmull2	v1.1q,v27.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b

	adds	x3,x3,#64		//x3 = bytes still unprocessed
	b.eq	Ldone4x

	cmp	x3,#32		//below 32 -> Lone, exactly 32 -> Ltwo,
	b.lo	Lone			//anything larger falls into Lthree
	b.eq	Ltwo
	//three input registers remain: reduce, then use H^3, H^2 and H
Lthree:
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d,v5.2d,v6.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v24.16b,v6.16b,v6.16b,#8
	ext	v23.16b,v5.16b,v5.16b,#8
	eor	v0.16b,v1.16b,v18.16b

	pmull	v29.1q,v20.1d,v24.1d		//H·Ii+2
	eor	v6.16b,v6.16b,v24.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	pmull2	v31.1q,v20.2d,v24.2d
	pmull	v30.1q,v21.1d,v6.1d
	eor	v0.16b,v0.16b,v18.16b
	pmull	v7.1q,v22.1d,v23.1d		//H^2·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	pmull2	v23.1q,v22.2d,v23.2d
	eor	v16.16b,v4.16b,v0.16b
	pmull2	v5.1q,v21.2d,v5.2d
	ext	v3.16b,v16.16b,v16.16b,#8

	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	eor	v30.16b,v30.16b,v5.16b

	pmull	v0.1q,v26.1d,v3.1d		//H^3·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v26.2d,v3.2d
	pmull	v1.1q,v27.1d,v16.1d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b
	b	Ldone4x

.align	4
	//two input registers remain: reduce, then use H^2 and H
Ltwo:
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d,v5.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
	rev64	v5.16b,v5.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v23.16b,v5.16b,v5.16b,#8
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	pmull	v29.1q,v20.1d,v23.1d		//H·Ii+1
	eor	v5.16b,v5.16b,v23.16b

	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull2	v31.1q,v20.2d,v23.2d
	pmull	v30.1q,v21.1d,v5.1d

	pmull	v0.1q,v22.1d,v3.1d		//H^2·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v22.2d,v3.2d
	pmull2	v1.1q,v21.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b
	b	Ldone4x

.align	4
	//one input register remains: reduce, then multiply by H
Lone:
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull	v0.1q,v20.1d,v3.1d		//H·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v20.2d,v3.2d
	pmull	v1.1q,v21.1d,v16.1d

	//final reduction of v0/v1/v2 and write-back of Xi
Ldone4x:
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
#endif
	st1	{v0.2d},[x0]		//write out Xi

	ret

//NUL-terminated ASCII: "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif