// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html

// ====================================================================
// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// project. The module is, however, dual licensed under OpenSSL and
// CRYPTOGAMS licenses depending on where you obtain it. For further
// details see http://www.openssl.org/~appro/cryptogams/.
//
// Permission to use under GPLv2 terms is granted.
// ====================================================================
//
// SHA256/512 for ARMv8.
//
// Performance in cycles per processed byte and improvement coefficient
// over code generated with "default" compiler:
//
//		SHA256-hw	SHA256(*)	SHA512
// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
// Denver	2.01		10.5 (+26%)	6.70 (+8%)
// X-Gene			20.0 (+100%)	12.8 (+300%(***))
// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
// Kryo		1.92		17.4 (+30%)	11.2 (+8%)
//
// (*)	Software SHA256 results are of lesser relevance, presented
//	mostly for informational purposes.
// (**)	The result is a trade-off: it's possible to improve it by
//	10% (or by 1 cycle per round), but at the cost of 20% loss
//	on Cortex-A53 (or by 4 cycles per round).
// (***)	Super-impressive coefficients over gcc-generated code are
//	indication of some compiler "pathology", most notably code
//	generated with -mgeneral-regs-only is significantly faster
//	and the gap is only 40-90%.
//
// October 2016.
//
// Originally it was reckoned that it makes no sense to implement NEON
// version of SHA256 for 64-bit processors.
// This is because performance
// improvement on most wide-spread Cortex-A5x processors was observed
// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
// observed that 32-bit NEON SHA256 performs significantly better than
// 64-bit scalar version on *some* of the more recent processors. As a
// result, a 64-bit NEON version of SHA256 was added to provide best
// all-round performance. For example it executes ~30% faster on X-Gene
// and Mongoose. [For reference, NEON version of SHA512 is bound to
// deliver much less improvement, likely *negative* on Cortex-A5x.
// Which is why NEON support is limited to SHA256.]

#ifndef	__KERNEL__
# include "arm_arch.h"
#endif

.text


// ----------------------------------------------------------------------
// sha512_block_data_order
//
// Scalar (general-purpose register) SHA-512 block transform with a
// run-time dispatch to .Lv8_entry.
//
// Inputs, as used by the code below:
//   x0  pointer to eight 64-bit hash state words; loaded at entry and
//       updated in place after every processed block
//   x1  input pointer; 128 bytes are consumed per .Loop iteration
//   x2  block count; end of input is computed as x1 + (x2 << 7)
//
// Dispatch (only when __KERNEL__ is not defined): the 32-bit word at
// OPENSSL_armcap_P, reached PC-relatively through the offset stored at
// .LOPENSSL_armcap_P, is tested against ARMV8_SHA512. If the bit is set
// control transfers to .Lv8_entry before the stack is touched; only
// x16, x17 and the flags have been modified at that point.
//
// Stack frame (128 bytes, addressed through x29):
//   [x29,#0]        saved x29,x30
//   [x29,#16..#95]  saved x19-x28
//   [x29,#96]       x0 (state pointer)
//   [x29,#104]      end-of-input pointer
//   [x29,#112]      input pointer as it stands after the first 16 bytes
//                   of the current block were loaded
// A further 4*8 bytes below the frame ([sp,#0]..[sp,#24]) serve as
// rolling spill slots for message-schedule words whose registers are
// temporarily reused as scratch.
//
// Register roles inside the loops:
//   x20-x27           working variables a..h (roles rotate each round)
//   x30               pointer into .LK512 (*K++); the caller's return
//                     address is reloaded from the frame in the epilogue
//   x19, x28          alternate between K[i] and the a^b / b^c value
//                     used to form Maj(a,b,c)
//   x16, x17          Sigma1/Ch and Sigma0 temporaries
//   x3-x15,x0,x1,x2   message schedule words X[0]..X[15]
//
// NOTE(review): the block loop is bottom-tested (cmp x1,x2 / b.ne .Loop
// after a full block), so a block count of zero is not handled here --
// presumably callers never pass zero; confirm against the C callers.
// ----------------------------------------------------------------------
.hidden	OPENSSL_armcap_P
.globl	sha512_block_data_order
.type	sha512_block_data_order,%function
.align	6
sha512_block_data_order:
#ifndef	__KERNEL__
	// Load the stored offset (OPENSSL_armcap_P - .LOPENSSL_armcap_P),
	// add the address of the literal itself, then read the capability
	// word and test the SHA512 bit.
# ifdef	__ILP32__
	ldrsw	x16,.LOPENSSL_armcap_P
# else
	ldr	x16,.LOPENSSL_armcap_P
# endif
	adr	x17,.LOPENSSL_armcap_P
	add	x16,x16,x17
	ldr	w16,[x16]
	tst	w16,#ARMV8_SHA512
	b.ne	.Lv8_entry
#endif
.inst	0xd503233f				// paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0

	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#4*8			// 4 spill slots for X[] words

	ldp	x20,x21,[x0]			// load context
	ldp	x22,x23,[x0,#2*8]
	ldp	x24,x25,[x0,#4*8]
	add	x2,x1,x2,lsl#7			// end of input
	ldp	x26,x27,[x0,#6*8]
	adr	x30,.LK512
	stp	x0,x2,[x29,#96]			// keep state pointer and end pointer in the frame

	// ---- per-block loop: rounds 0..15 consume the input words directly
.Loop:
	ldp	x3,x4,[x1],#2*8
	ldr	x19,[x30],#8			// *K++
	eor	x28,x21,x22			// magic seed
	str	x1,[x29,#112]			// x1 is reused as X[14] later; keep the pointer
#ifndef	__AARCH64EB__
	rev	x3,x3				// 0
#endif
	ror	x16,x24,#14
	add	x27,x27,x19			// h+=K[i]
	eor	x6,x24,x24,ror#23
	and	x17,x25,x24
	bic	x19,x26,x24
	add	x27,x27,x3			// h+=X[i]
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x20,x21			// a^b, b^c in next round
	eor	x16,x16,x6,ror#18		// Sigma1(e)
	ror	x6,x20,#28
	add	x27,x27,x17			// h+=Ch(e,f,g)
	eor	x17,x20,x20,ror#5
	add	x27,x27,x16			// h+=Sigma1(e)
	and	x28,x28,x19			// (b^c)&=(a^b)
	add	x23,x23,x27			// d+=h
	eor	x28,x28,x21			// Maj(a,b,c)
	eor	x17,x6,x17,ror#34		// Sigma0(a)
	add	x27,x27,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8			// *K++, x19 in next round
	//add	x27,x27,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x4,x4				// 1
#endif
	ldp	x5,x6,[x1],#2*8
	add	x27,x27,x17			// h+=Sigma0(a)
	ror	x16,x23,#14
	add	x26,x26,x28			// h+=K[i]
	eor	x7,x23,x23,ror#23
	and	x17,x24,x23
	bic	x28,x25,x23
	add	x26,x26,x4			// h+=X[i]
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x27,x20			// a^b, b^c in next round
	eor	x16,x16,x7,ror#18		// Sigma1(e)
	ror	x7,x27,#28
	add	x26,x26,x17			// h+=Ch(e,f,g)
	eor	x17,x27,x27,ror#5
	add	x26,x26,x16			// h+=Sigma1(e)
	and	x19,x19,x28			// (b^c)&=(a^b)
	add	x22,x22,x26			// d+=h
	eor	x19,x19,x20			// Maj(a,b,c)
	eor	x17,x7,x17,ror#34		// Sigma0(a)
	add	x26,x26,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8			// *K++, x28 in next round
	//add	x26,x26,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x5,x5				// 2
#endif
	add	x26,x26,x17			// h+=Sigma0(a)
	ror	x16,x22,#14
	add	x25,x25,x19			// h+=K[i]
	eor	x8,x22,x22,ror#23
	and	x17,x23,x22
	bic	x19,x24,x22
	add	x25,x25,x5			// h+=X[i]
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x26,x27			// a^b, b^c in next round
	eor	x16,x16,x8,ror#18		// Sigma1(e)
	ror	x8,x26,#28
	add	x25,x25,x17			// h+=Ch(e,f,g)
	eor	x17,x26,x26,ror#5
	add	x25,x25,x16			// h+=Sigma1(e)
	and	x28,x28,x19			// (b^c)&=(a^b)
	add	x21,x21,x25			// d+=h
	eor	x28,x28,x27			// Maj(a,b,c)
	eor	x17,x8,x17,ror#34		// Sigma0(a)
	add	x25,x25,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8			// *K++, x19 in next round
	//add	x25,x25,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x6,x6				// 3
#endif
	ldp	x7,x8,[x1],#2*8
	add	x25,x25,x17			// h+=Sigma0(a)
	ror	x16,x21,#14
	add	x24,x24,x28			// h+=K[i]
	eor	x9,x21,x21,ror#23
	and	x17,x22,x21
	bic	x28,x23,x21
	add	x24,x24,x6			// h+=X[i]
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x25,x26			// a^b, b^c in next round
	eor	x16,x16,x9,ror#18		// Sigma1(e)
	ror	x9,x25,#28
	add	x24,x24,x17			// h+=Ch(e,f,g)
	eor	x17,x25,x25,ror#5
	add	x24,x24,x16			// h+=Sigma1(e)
	and	x19,x19,x28			// (b^c)&=(a^b)
	add	x20,x20,x24			// d+=h
	eor	x19,x19,x26			// Maj(a,b,c)
	eor	x17,x9,x17,ror#34		// Sigma0(a)
	add	x24,x24,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8			// *K++, x28 in next round
	//add	x24,x24,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x7,x7				// 4
#endif
	add	x24,x24,x17			// h+=Sigma0(a)
	ror	x16,x20,#14
	add	x23,x23,x19			// h+=K[i]
	eor	x10,x20,x20,ror#23
	and	x17,x21,x20
	bic	x19,x22,x20
	add	x23,x23,x7			// h+=X[i]
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x24,x25			// a^b, b^c in next round
	eor	x16,x16,x10,ror#18		// Sigma1(e)
	ror	x10,x24,#28
	add	x23,x23,x17			// h+=Ch(e,f,g)
	eor	x17,x24,x24,ror#5
	add	x23,x23,x16			// h+=Sigma1(e)
	and	x28,x28,x19			// (b^c)&=(a^b)
	add	x27,x27,x23			// d+=h
	eor	x28,x28,x25			// Maj(a,b,c)
	eor	x17,x10,x17,ror#34		// Sigma0(a)
	add	x23,x23,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8			// *K++, x19 in next round
	//add	x23,x23,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x8,x8				// 5
#endif
	ldp	x9,x10,[x1],#2*8
	add	x23,x23,x17			// h+=Sigma0(a)
	ror	x16,x27,#14
	add	x22,x22,x28			// h+=K[i]
	eor	x11,x27,x27,ror#23
	and	x17,x20,x27
	bic	x28,x21,x27
	add	x22,x22,x8			// h+=X[i]
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x23,x24			// a^b, b^c in next round
	eor	x16,x16,x11,ror#18		// Sigma1(e)
	ror	x11,x23,#28
	add	x22,x22,x17			// h+=Ch(e,f,g)
	eor	x17,x23,x23,ror#5
	add	x22,x22,x16			// h+=Sigma1(e)
	and	x19,x19,x28			// (b^c)&=(a^b)
	add	x26,x26,x22			// d+=h
	eor	x19,x19,x24			// Maj(a,b,c)
	eor	x17,x11,x17,ror#34		// Sigma0(a)
	add	x22,x22,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8			// *K++, x28 in next round
	//add	x22,x22,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x9,x9				// 6
#endif
	add	x22,x22,x17			// h+=Sigma0(a)
	ror	x16,x26,#14
	add	x21,x21,x19			// h+=K[i]
	eor	x12,x26,x26,ror#23
	and	x17,x27,x26
	bic	x19,x20,x26
	add	x21,x21,x9			// h+=X[i]
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x22,x23			// a^b, b^c in next round
	eor	x16,x16,x12,ror#18		// Sigma1(e)
	ror	x12,x22,#28
	add	x21,x21,x17			// h+=Ch(e,f,g)
	eor	x17,x22,x22,ror#5
	add	x21,x21,x16			// h+=Sigma1(e)
	and	x28,x28,x19			// (b^c)&=(a^b)
	add	x25,x25,x21			// d+=h
	eor	x28,x28,x23			// Maj(a,b,c)
	eor	x17,x12,x17,ror#34		// Sigma0(a)
	add	x21,x21,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8			// *K++, x19 in next round
	//add	x21,x21,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x10,x10				// 7
#endif
	ldp	x11,x12,[x1],#2*8
	add	x21,x21,x17			// h+=Sigma0(a)
	ror	x16,x25,#14
	add	x20,x20,x28			// h+=K[i]
	eor	x13,x25,x25,ror#23
	and	x17,x26,x25
	bic	x28,x27,x25
	add	x20,x20,x10			// h+=X[i]
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x21,x22			// a^b, b^c in next round
	eor	x16,x16,x13,ror#18		// Sigma1(e)
	ror	x13,x21,#28
	add	x20,x20,x17			// h+=Ch(e,f,g)
	eor	x17,x21,x21,ror#5
	add	x20,x20,x16			// h+=Sigma1(e)
	and	x19,x19,x28			// (b^c)&=(a^b)
	add	x24,x24,x20			// d+=h
	eor	x19,x19,x22			// Maj(a,b,c)
	eor	x17,x13,x17,ror#34		// Sigma0(a)
	add	x20,x20,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8			// *K++, x28 in next round
	//add	x20,x20,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x11,x11				// 8
#endif
	add	x20,x20,x17			// h+=Sigma0(a)
	ror	x16,x24,#14
	add	x27,x27,x19			// h+=K[i]
	eor	x14,x24,x24,ror#23
	and	x17,x25,x24
	bic	x19,x26,x24
	add	x27,x27,x11			// h+=X[i]
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x20,x21			// a^b, b^c in next round
	eor	x16,x16,x14,ror#18		// Sigma1(e)
	ror	x14,x20,#28
	add	x27,x27,x17			// h+=Ch(e,f,g)
	eor	x17,x20,x20,ror#5
	add	x27,x27,x16			// h+=Sigma1(e)
	and	x28,x28,x19			// (b^c)&=(a^b)
	add	x23,x23,x27			// d+=h
	eor	x28,x28,x21			// Maj(a,b,c)
	eor	x17,x14,x17,ror#34		// Sigma0(a)
	add	x27,x27,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8			// *K++, x19 in next round
	//add	x27,x27,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x12,x12				// 9
#endif
	ldp	x13,x14,[x1],#2*8
	add	x27,x27,x17			// h+=Sigma0(a)
	ror	x16,x23,#14
	add	x26,x26,x28			// h+=K[i]
	eor	x15,x23,x23,ror#23
	and	x17,x24,x23
	bic	x28,x25,x23
	add	x26,x26,x12			// h+=X[i]
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x27,x20			// a^b, b^c in next round
	eor	x16,x16,x15,ror#18		// Sigma1(e)
	ror	x15,x27,#28
	add	x26,x26,x17			// h+=Ch(e,f,g)
	eor	x17,x27,x27,ror#5
	add	x26,x26,x16			// h+=Sigma1(e)
	and	x19,x19,x28			// (b^c)&=(a^b)
	add	x22,x22,x26			// d+=h
	eor	x19,x19,x20			// Maj(a,b,c)
	eor	x17,x15,x17,ror#34		// Sigma0(a)
	add	x26,x26,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8			// *K++, x28 in next round
	//add	x26,x26,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x13,x13				// 10
#endif
	add	x26,x26,x17			// h+=Sigma0(a)
	ror	x16,x22,#14
	add	x25,x25,x19			// h+=K[i]
	eor	x0,x22,x22,ror#23
	and	x17,x23,x22
	bic	x19,x24,x22
	add	x25,x25,x13			// h+=X[i]
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x26,x27			// a^b, b^c in next round
	eor	x16,x16,x0,ror#18		// Sigma1(e)
	ror	x0,x26,#28
	add	x25,x25,x17			// h+=Ch(e,f,g)
	eor	x17,x26,x26,ror#5
	add	x25,x25,x16			// h+=Sigma1(e)
	and	x28,x28,x19			// (b^c)&=(a^b)
	add	x21,x21,x25			// d+=h
	eor	x28,x28,x27			// Maj(a,b,c)
	eor	x17,x0,x17,ror#34		// Sigma0(a)
	add	x25,x25,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8			// *K++, x19 in next round
	//add	x25,x25,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x14,x14				// 11
#endif
	ldp	x15,x0,[x1],#2*8
	add	x25,x25,x17			// h+=Sigma0(a)
	str	x6,[sp,#24]			// spill X[3]; x6 is scratch below
	ror	x16,x21,#14
	add	x24,x24,x28			// h+=K[i]
	eor	x6,x21,x21,ror#23
	and	x17,x22,x21
	bic	x28,x23,x21
	add	x24,x24,x14			// h+=X[i]
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x25,x26			// a^b, b^c in next round
	eor	x16,x16,x6,ror#18		// Sigma1(e)
	ror	x6,x25,#28
	add	x24,x24,x17			// h+=Ch(e,f,g)
	eor	x17,x25,x25,ror#5
	add	x24,x24,x16			// h+=Sigma1(e)
	and	x19,x19,x28			// (b^c)&=(a^b)
	add	x20,x20,x24			// d+=h
	eor	x19,x19,x26			// Maj(a,b,c)
	eor	x17,x6,x17,ror#34		// Sigma0(a)
	add	x24,x24,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8			// *K++, x28 in next round
	//add	x24,x24,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x15,x15				// 12
#endif
	add	x24,x24,x17			// h+=Sigma0(a)
	str	x7,[sp,#0]			// spill X[4]; x7 is scratch below
	ror	x16,x20,#14
	add	x23,x23,x19			// h+=K[i]
	eor	x7,x20,x20,ror#23
	and	x17,x21,x20
	bic	x19,x22,x20
	add	x23,x23,x15			// h+=X[i]
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x24,x25			// a^b, b^c in next round
	eor	x16,x16,x7,ror#18		// Sigma1(e)
	ror	x7,x24,#28
	add	x23,x23,x17			// h+=Ch(e,f,g)
	eor	x17,x24,x24,ror#5
	add	x23,x23,x16			// h+=Sigma1(e)
	and	x28,x28,x19			// (b^c)&=(a^b)
	add	x27,x27,x23			// d+=h
	eor	x28,x28,x25			// Maj(a,b,c)
	eor	x17,x7,x17,ror#34		// Sigma0(a)
	add	x23,x23,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8			// *K++, x19 in next round
	//add	x23,x23,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x0,x0				// 13
#endif
	ldp	x1,x2,[x1]			// last two input words; x1/x2 now hold X[14]/X[15]
	add	x23,x23,x17			// h+=Sigma0(a)
	str	x8,[sp,#8]			// spill X[5]; x8 is scratch below
	ror	x16,x27,#14
	add	x22,x22,x28			// h+=K[i]
	eor	x8,x27,x27,ror#23
	and	x17,x20,x27
	bic	x28,x21,x27
	add	x22,x22,x0			// h+=X[i]
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x23,x24			// a^b, b^c in next round
	eor	x16,x16,x8,ror#18		// Sigma1(e)
	ror	x8,x23,#28
	add	x22,x22,x17			// h+=Ch(e,f,g)
	eor	x17,x23,x23,ror#5
	add	x22,x22,x16			// h+=Sigma1(e)
	and	x19,x19,x28			// (b^c)&=(a^b)
	add	x26,x26,x22			// d+=h
	eor	x19,x19,x24			// Maj(a,b,c)
	eor	x17,x8,x17,ror#34		// Sigma0(a)
	add	x22,x22,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8			// *K++, x28 in next round
	//add	x22,x22,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x1,x1				// 14
#endif
	ldr	x6,[sp,#24]			// reload X[3]
	add	x22,x22,x17			// h+=Sigma0(a)
	str	x9,[sp,#16]			// spill X[6]; x9 is scratch below
	ror	x16,x26,#14
	add	x21,x21,x19			// h+=K[i]
	eor	x9,x26,x26,ror#23
	and	x17,x27,x26
	bic	x19,x20,x26
	add	x21,x21,x1			// h+=X[i]
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x22,x23			// a^b, b^c in next round
	eor	x16,x16,x9,ror#18		// Sigma1(e)
	ror	x9,x22,#28
	add	x21,x21,x17			// h+=Ch(e,f,g)
	eor	x17,x22,x22,ror#5
	add	x21,x21,x16			// h+=Sigma1(e)
	and	x28,x28,x19			// (b^c)&=(a^b)
	add	x25,x25,x21			// d+=h
	eor	x28,x28,x23			// Maj(a,b,c)
	eor	x17,x9,x17,ror#34		// Sigma0(a)
	add	x21,x21,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8			// *K++, x19 in next round
	//add	x21,x21,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x2,x2				// 15
#endif
	// Round 15 also starts the message schedule: x3 (X[0]) is updated
	// with sigma0, sigma1 and x12 for use in the first .Loop_16_xx round.
	ldr	x7,[sp,#0]			// reload X[4]
	add	x21,x21,x17			// h+=Sigma0(a)
	str	x10,[sp,#24]			// spill X[7]; x10 is scratch below
	ror	x16,x25,#14
	add	x20,x20,x28			// h+=K[i]
	ror	x9,x4,#1
	and	x17,x26,x25
	ror	x8,x1,#19
	bic	x28,x27,x25
	ror	x10,x21,#28
	add	x20,x20,x2			// h+=X[i]
	eor	x16,x16,x25,ror#18
	eor	x9,x9,x4,ror#8
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x21,x22			// a^b, b^c in next round
	eor	x16,x16,x25,ror#41		// Sigma1(e)
	eor	x10,x10,x21,ror#34
	add	x20,x20,x17			// h+=Ch(e,f,g)
	and	x19,x19,x28			// (b^c)&=(a^b)
	eor	x8,x8,x1,ror#61
	eor	x9,x9,x4,lsr#7			// sigma0(X[i+1])
	add	x20,x20,x16			// h+=Sigma1(e)
	eor	x19,x19,x22			// Maj(a,b,c)
	eor	x17,x10,x21,ror#39		// Sigma0(a)
	eor	x8,x8,x1,lsr#6			// sigma1(X[i+14])
	add	x3,x3,x12
	add	x24,x24,x20			// d+=h
	add	x20,x20,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8			// *K++, x28 in next round
	add	x3,x3,x9
	add	x20,x20,x17			// h+=Sigma0(a)
	add	x3,x3,x8
	// ---- 16 unrolled rounds per iteration, each also producing the next
	// schedule word. Repeats until the K word fetched for the following
	// round is zero (cbnz x19 at the bottom; .LK512 ends with a zero quad).
.Loop_16_xx:
	ldr	x8,[sp,#8]
	str	x11,[sp,#0]
	ror	x16,x24,#14
	add	x27,x27,x19			// h+=K[i]
	ror	x10,x5,#1
	and	x17,x25,x24
	ror	x9,x2,#19
	bic	x19,x26,x24
	ror	x11,x20,#28
	add	x27,x27,x3			// h+=X[i]
	eor	x16,x16,x24,ror#18
	eor	x10,x10,x5,ror#8
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x20,x21			// a^b, b^c in next round
	eor	x16,x16,x24,ror#41		// Sigma1(e)
	eor	x11,x11,x20,ror#34
	add	x27,x27,x17			// h+=Ch(e,f,g)
	and	x28,x28,x19			// (b^c)&=(a^b)
	eor	x9,x9,x2,ror#61
	eor	x10,x10,x5,lsr#7		// sigma0(X[i+1])
	add	x27,x27,x16			// h+=Sigma1(e)
	eor	x28,x28,x21			// Maj(a,b,c)
	eor	x17,x11,x20,ror#39		// Sigma0(a)
	eor	x9,x9,x2,lsr#6			// sigma1(X[i+14])
	add	x4,x4,x13
	add	x23,x23,x27			// d+=h
	add	x27,x27,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8			// *K++, x19 in next round
	add	x4,x4,x10
	add	x27,x27,x17			// h+=Sigma0(a)
	add	x4,x4,x9
	ldr	x9,[sp,#16]
	str	x12,[sp,#8]
	ror	x16,x23,#14
	add	x26,x26,x28			// h+=K[i]
	ror	x11,x6,#1
	and	x17,x24,x23
	ror	x10,x3,#19
	bic	x28,x25,x23
	ror	x12,x27,#28
	add	x26,x26,x4			// h+=X[i]
	eor	x16,x16,x23,ror#18
	eor	x11,x11,x6,ror#8
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x27,x20			// a^b, b^c in next round
	eor	x16,x16,x23,ror#41		// Sigma1(e)
	eor	x12,x12,x27,ror#34
	add	x26,x26,x17			// h+=Ch(e,f,g)
	and	x19,x19,x28			// (b^c)&=(a^b)
	eor	x10,x10,x3,ror#61
	eor	x11,x11,x6,lsr#7		// sigma0(X[i+1])
	add	x26,x26,x16			// h+=Sigma1(e)
	eor	x19,x19,x20			// Maj(a,b,c)
	eor	x17,x12,x27,ror#39		// Sigma0(a)
	eor	x10,x10,x3,lsr#6		// sigma1(X[i+14])
	add	x5,x5,x14
	add	x22,x22,x26			// d+=h
	add	x26,x26,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8			// *K++, x28 in next round
	add	x5,x5,x11
	add	x26,x26,x17			// h+=Sigma0(a)
	add	x5,x5,x10
	ldr	x10,[sp,#24]
	str	x13,[sp,#16]
	ror	x16,x22,#14
	add	x25,x25,x19			// h+=K[i]
	ror	x12,x7,#1
	and	x17,x23,x22
	ror	x11,x4,#19
	bic	x19,x24,x22
	ror	x13,x26,#28
	add	x25,x25,x5			// h+=X[i]
	eor	x16,x16,x22,ror#18
	eor	x12,x12,x7,ror#8
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x26,x27			// a^b, b^c in next round
	eor	x16,x16,x22,ror#41		// Sigma1(e)
	eor	x13,x13,x26,ror#34
	add	x25,x25,x17			// h+=Ch(e,f,g)
	and	x28,x28,x19			// (b^c)&=(a^b)
	eor	x11,x11,x4,ror#61
	eor	x12,x12,x7,lsr#7		// sigma0(X[i+1])
	add	x25,x25,x16			// h+=Sigma1(e)
	eor	x28,x28,x27			// Maj(a,b,c)
	eor	x17,x13,x26,ror#39		// Sigma0(a)
	eor	x11,x11,x4,lsr#6		// sigma1(X[i+14])
	add	x6,x6,x15
	add	x21,x21,x25			// d+=h
	add	x25,x25,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8			// *K++, x19 in next round
	add	x6,x6,x12
	add	x25,x25,x17			// h+=Sigma0(a)
	add	x6,x6,x11
	ldr	x11,[sp,#0]
	str	x14,[sp,#24]
	ror	x16,x21,#14
	add	x24,x24,x28			// h+=K[i]
	ror	x13,x8,#1
	and	x17,x22,x21
	ror	x12,x5,#19
	bic	x28,x23,x21
	ror	x14,x25,#28
	add	x24,x24,x6			// h+=X[i]
	eor	x16,x16,x21,ror#18
	eor	x13,x13,x8,ror#8
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x25,x26			// a^b, b^c in next round
	eor	x16,x16,x21,ror#41		// Sigma1(e)
	eor	x14,x14,x25,ror#34
	add	x24,x24,x17			// h+=Ch(e,f,g)
	and	x19,x19,x28			// (b^c)&=(a^b)
	eor	x12,x12,x5,ror#61
	eor	x13,x13,x8,lsr#7		// sigma0(X[i+1])
	add	x24,x24,x16			// h+=Sigma1(e)
	eor	x19,x19,x26			// Maj(a,b,c)
	eor	x17,x14,x25,ror#39		// Sigma0(a)
	eor	x12,x12,x5,lsr#6		// sigma1(X[i+14])
	add	x7,x7,x0
	add	x20,x20,x24			// d+=h
	add	x24,x24,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8			// *K++, x28 in next round
	add	x7,x7,x13
	add	x24,x24,x17			// h+=Sigma0(a)
	add	x7,x7,x12
	ldr	x12,[sp,#8]
	str	x15,[sp,#0]
	ror	x16,x20,#14
	add	x23,x23,x19			// h+=K[i]
	ror	x14,x9,#1
	and	x17,x21,x20
	ror	x13,x6,#19
	bic	x19,x22,x20
	ror	x15,x24,#28
	add	x23,x23,x7			// h+=X[i]
	eor	x16,x16,x20,ror#18
	eor	x14,x14,x9,ror#8
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x24,x25			// a^b, b^c in next round
	eor	x16,x16,x20,ror#41		// Sigma1(e)
	eor	x15,x15,x24,ror#34
	add	x23,x23,x17			// h+=Ch(e,f,g)
	and	x28,x28,x19			// (b^c)&=(a^b)
	eor	x13,x13,x6,ror#61
	eor	x14,x14,x9,lsr#7		// sigma0(X[i+1])
	add	x23,x23,x16			// h+=Sigma1(e)
	eor	x28,x28,x25			// Maj(a,b,c)
	eor	x17,x15,x24,ror#39		// Sigma0(a)
	eor	x13,x13,x6,lsr#6		// sigma1(X[i+14])
	add	x8,x8,x1
	add	x27,x27,x23			// d+=h
	add	x23,x23,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8			// *K++, x19 in next round
	add	x8,x8,x14
	add	x23,x23,x17			// h+=Sigma0(a)
	add	x8,x8,x13
	ldr	x13,[sp,#16]
	str	x0,[sp,#8]
	ror	x16,x27,#14
	add	x22,x22,x28			// h+=K[i]
	ror	x15,x10,#1
	and	x17,x20,x27
	ror	x14,x7,#19
	bic	x28,x21,x27
	ror	x0,x23,#28
	add	x22,x22,x8			// h+=X[i]
	eor	x16,x16,x27,ror#18
	eor	x15,x15,x10,ror#8
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x23,x24			// a^b, b^c in next round
	eor	x16,x16,x27,ror#41		// Sigma1(e)
	eor	x0,x0,x23,ror#34
	add	x22,x22,x17			// h+=Ch(e,f,g)
	and	x19,x19,x28			// (b^c)&=(a^b)
	eor	x14,x14,x7,ror#61
	eor	x15,x15,x10,lsr#7		// sigma0(X[i+1])
	add	x22,x22,x16			// h+=Sigma1(e)
	eor	x19,x19,x24			// Maj(a,b,c)
	eor	x17,x0,x23,ror#39		// Sigma0(a)
	eor	x14,x14,x7,lsr#6		// sigma1(X[i+14])
	add	x9,x9,x2
	add	x26,x26,x22			// d+=h
	add	x22,x22,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8			// *K++, x28 in next round
	add	x9,x9,x15
	add	x22,x22,x17			// h+=Sigma0(a)
	add	x9,x9,x14
	ldr	x14,[sp,#24]
	str	x1,[sp,#16]
	ror	x16,x26,#14
	add	x21,x21,x19			// h+=K[i]
	ror	x0,x11,#1
	and	x17,x27,x26
	ror	x15,x8,#19
	bic	x19,x20,x26
	ror	x1,x22,#28
	add	x21,x21,x9			// h+=X[i]
	eor	x16,x16,x26,ror#18
	eor	x0,x0,x11,ror#8
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x22,x23			// a^b, b^c in next round
	eor	x16,x16,x26,ror#41		// Sigma1(e)
	eor	x1,x1,x22,ror#34
	add	x21,x21,x17			// h+=Ch(e,f,g)
	and	x28,x28,x19			// (b^c)&=(a^b)
	eor	x15,x15,x8,ror#61
	eor	x0,x0,x11,lsr#7			// sigma0(X[i+1])
	add	x21,x21,x16			// h+=Sigma1(e)
	eor	x28,x28,x23			// Maj(a,b,c)
	eor	x17,x1,x22,ror#39		// Sigma0(a)
	eor	x15,x15,x8,lsr#6		// sigma1(X[i+14])
	add	x10,x10,x3
	add	x25,x25,x21			// d+=h
	add	x21,x21,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8			// *K++, x19 in next round
	add	x10,x10,x0
	add	x21,x21,x17			// h+=Sigma0(a)
	add	x10,x10,x15
	ldr	x15,[sp,#0]
	str	x2,[sp,#24]
	ror	x16,x25,#14
	add	x20,x20,x28			// h+=K[i]
	ror	x1,x12,#1
	and	x17,x26,x25
	ror	x0,x9,#19
	bic	x28,x27,x25
	ror	x2,x21,#28
	add	x20,x20,x10			// h+=X[i]
	eor	x16,x16,x25,ror#18
	eor	x1,x1,x12,ror#8
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x21,x22			// a^b, b^c in next round
	eor	x16,x16,x25,ror#41		// Sigma1(e)
	eor	x2,x2,x21,ror#34
	add	x20,x20,x17			// h+=Ch(e,f,g)
	and	x19,x19,x28			// (b^c)&=(a^b)
	eor	x0,x0,x9,ror#61
	eor	x1,x1,x12,lsr#7			// sigma0(X[i+1])
	add	x20,x20,x16			// h+=Sigma1(e)
	eor	x19,x19,x22			// Maj(a,b,c)
	eor	x17,x2,x21,ror#39		// Sigma0(a)
	eor	x0,x0,x9,lsr#6			// sigma1(X[i+14])
	add	x11,x11,x4
	add	x24,x24,x20			// d+=h
	add	x20,x20,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8			// *K++, x28 in next round
	add	x11,x11,x1
	add	x20,x20,x17			// h+=Sigma0(a)
	add	x11,x11,x0
	ldr	x0,[sp,#8]
	str	x3,[sp,#0]
	ror	x16,x24,#14
	add	x27,x27,x19			// h+=K[i]
	ror	x2,x13,#1
	and	x17,x25,x24
	ror	x1,x10,#19
	bic	x19,x26,x24
	ror	x3,x20,#28
	add	x27,x27,x11			// h+=X[i]
	eor	x16,x16,x24,ror#18
	eor	x2,x2,x13,ror#8
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x20,x21			// a^b, b^c in next round
	eor	x16,x16,x24,ror#41		// Sigma1(e)
	eor	x3,x3,x20,ror#34
	add	x27,x27,x17			// h+=Ch(e,f,g)
	and	x28,x28,x19			// (b^c)&=(a^b)
	eor	x1,x1,x10,ror#61
	eor	x2,x2,x13,lsr#7			// sigma0(X[i+1])
	add	x27,x27,x16			// h+=Sigma1(e)
	eor	x28,x28,x21			// Maj(a,b,c)
	eor	x17,x3,x20,ror#39		// Sigma0(a)
	eor	x1,x1,x10,lsr#6			// sigma1(X[i+14])
	add	x12,x12,x5
	add	x23,x23,x27			// d+=h
	add	x27,x27,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8			// *K++, x19 in next round
	add	x12,x12,x2
	add	x27,x27,x17			// h+=Sigma0(a)
	add	x12,x12,x1
	ldr	x1,[sp,#16]
	str	x4,[sp,#8]
	ror	x16,x23,#14
	add	x26,x26,x28			// h+=K[i]
	ror	x3,x14,#1
	and	x17,x24,x23
	ror	x2,x11,#19
	bic	x28,x25,x23
	ror	x4,x27,#28
	add	x26,x26,x12			// h+=X[i]
	eor	x16,x16,x23,ror#18
	eor	x3,x3,x14,ror#8
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x27,x20			// a^b, b^c in next round
	eor	x16,x16,x23,ror#41		// Sigma1(e)
	eor	x4,x4,x27,ror#34
	add	x26,x26,x17			// h+=Ch(e,f,g)
	and	x19,x19,x28			// (b^c)&=(a^b)
	eor	x2,x2,x11,ror#61
	eor	x3,x3,x14,lsr#7			// sigma0(X[i+1])
	add	x26,x26,x16			// h+=Sigma1(e)
	eor	x19,x19,x20			// Maj(a,b,c)
	eor	x17,x4,x27,ror#39		// Sigma0(a)
	eor	x2,x2,x11,lsr#6			// sigma1(X[i+14])
	add	x13,x13,x6
	add	x22,x22,x26			// d+=h
	add	x26,x26,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8			// *K++, x28 in next round
	add	x13,x13,x3
	add	x26,x26,x17			// h+=Sigma0(a)
	add	x13,x13,x2
	ldr	x2,[sp,#24]
	str	x5,[sp,#16]
	ror	x16,x22,#14
	add	x25,x25,x19			// h+=K[i]
	ror	x4,x15,#1
	and	x17,x23,x22
	ror	x3,x12,#19
	bic	x19,x24,x22
	ror	x5,x26,#28
	add	x25,x25,x13			// h+=X[i]
	eor	x16,x16,x22,ror#18
	eor	x4,x4,x15,ror#8
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x26,x27			// a^b, b^c in next round
	eor	x16,x16,x22,ror#41		// Sigma1(e)
	eor	x5,x5,x26,ror#34
	add	x25,x25,x17			// h+=Ch(e,f,g)
	and	x28,x28,x19			// (b^c)&=(a^b)
	eor	x3,x3,x12,ror#61
	eor	x4,x4,x15,lsr#7			// sigma0(X[i+1])
	add	x25,x25,x16			// h+=Sigma1(e)
	eor	x28,x28,x27			// Maj(a,b,c)
	eor	x17,x5,x26,ror#39		// Sigma0(a)
	eor	x3,x3,x12,lsr#6			// sigma1(X[i+14])
	add	x14,x14,x7
	add	x21,x21,x25			// d+=h
	add	x25,x25,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8			// *K++, x19 in next round
	add	x14,x14,x4
	add	x25,x25,x17			// h+=Sigma0(a)
	add	x14,x14,x3
	ldr	x3,[sp,#0]
	str	x6,[sp,#24]
	ror	x16,x21,#14
	add	x24,x24,x28			// h+=K[i]
	ror	x5,x0,#1
	and	x17,x22,x21
	ror	x4,x13,#19
	bic	x28,x23,x21
	ror	x6,x25,#28
	add	x24,x24,x14			// h+=X[i]
	eor	x16,x16,x21,ror#18
	eor	x5,x5,x0,ror#8
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x25,x26			// a^b, b^c in next round
	eor	x16,x16,x21,ror#41		// Sigma1(e)
	eor	x6,x6,x25,ror#34
	add	x24,x24,x17			// h+=Ch(e,f,g)
	and	x19,x19,x28			// (b^c)&=(a^b)
	eor	x4,x4,x13,ror#61
	eor	x5,x5,x0,lsr#7			// sigma0(X[i+1])
	add	x24,x24,x16			// h+=Sigma1(e)
	eor	x19,x19,x26			// Maj(a,b,c)
	eor	x17,x6,x25,ror#39		// Sigma0(a)
	eor	x4,x4,x13,lsr#6			// sigma1(X[i+14])
	add	x15,x15,x8
	add	x20,x20,x24			// d+=h
	add	x24,x24,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8			// *K++, x28 in next round
	add	x15,x15,x5
	add	x24,x24,x17			// h+=Sigma0(a)
	add	x15,x15,x4
	ldr	x4,[sp,#8]
	str	x7,[sp,#0]
	ror	x16,x20,#14
	add	x23,x23,x19			// h+=K[i]
	ror	x6,x1,#1
	and	x17,x21,x20
	ror	x5,x14,#19
	bic	x19,x22,x20
	ror	x7,x24,#28
	add	x23,x23,x15			// h+=X[i]
	eor	x16,x16,x20,ror#18
	eor	x6,x6,x1,ror#8
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x24,x25			// a^b, b^c in next round
	eor	x16,x16,x20,ror#41		// Sigma1(e)
	eor	x7,x7,x24,ror#34
	add	x23,x23,x17			// h+=Ch(e,f,g)
	and	x28,x28,x19			// (b^c)&=(a^b)
	eor	x5,x5,x14,ror#61
	eor	x6,x6,x1,lsr#7			// sigma0(X[i+1])
	add	x23,x23,x16			// h+=Sigma1(e)
	eor	x28,x28,x25			// Maj(a,b,c)
	eor	x17,x7,x24,ror#39		// Sigma0(a)
	eor	x5,x5,x14,lsr#6			// sigma1(X[i+14])
	add	x0,x0,x9
	add	x27,x27,x23			// d+=h
	add	x23,x23,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8			// *K++, x19 in next round
	add	x0,x0,x6
	add	x23,x23,x17			// h+=Sigma0(a)
	add	x0,x0,x5
	ldr	x5,[sp,#16]
	str	x8,[sp,#8]
	ror	x16,x27,#14
	add	x22,x22,x28			// h+=K[i]
	ror	x7,x2,#1
	and	x17,x20,x27
	ror	x6,x15,#19
	bic	x28,x21,x27
	ror	x8,x23,#28
	add	x22,x22,x0			// h+=X[i]
	eor	x16,x16,x27,ror#18
	eor	x7,x7,x2,ror#8
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x23,x24			// a^b, b^c in next round
	eor	x16,x16,x27,ror#41		// Sigma1(e)
	eor	x8,x8,x23,ror#34
	add	x22,x22,x17			// h+=Ch(e,f,g)
	and	x19,x19,x28			// (b^c)&=(a^b)
	eor	x6,x6,x15,ror#61
	eor	x7,x7,x2,lsr#7			// sigma0(X[i+1])
	add	x22,x22,x16			// h+=Sigma1(e)
	eor	x19,x19,x24			// Maj(a,b,c)
	eor	x17,x8,x23,ror#39		// Sigma0(a)
	eor	x6,x6,x15,lsr#6			// sigma1(X[i+14])
	add	x1,x1,x10
	add	x26,x26,x22			// d+=h
	add	x22,x22,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8			// *K++, x28 in next round
	add	x1,x1,x7
	add	x22,x22,x17			// h+=Sigma0(a)
	add	x1,x1,x6
	ldr	x6,[sp,#24]
	str	x9,[sp,#16]
	ror	x16,x26,#14
	add	x21,x21,x19			// h+=K[i]
	ror	x8,x3,#1
	and	x17,x27,x26
	ror	x7,x0,#19
	bic	x19,x20,x26
	ror	x9,x22,#28
	add	x21,x21,x1			// h+=X[i]
	eor	x16,x16,x26,ror#18
	eor	x8,x8,x3,ror#8
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x22,x23			// a^b, b^c in next round
	eor	x16,x16,x26,ror#41		// Sigma1(e)
	eor	x9,x9,x22,ror#34
	add	x21,x21,x17			// h+=Ch(e,f,g)
	and	x28,x28,x19			// (b^c)&=(a^b)
	eor	x7,x7,x0,ror#61
	eor	x8,x8,x3,lsr#7			// sigma0(X[i+1])
	add	x21,x21,x16			// h+=Sigma1(e)
	eor	x28,x28,x23			// Maj(a,b,c)
	eor	x17,x9,x22,ror#39		// Sigma0(a)
	eor	x7,x7,x0,lsr#6			// sigma1(X[i+14])
	add	x2,x2,x11
	add	x25,x25,x21			// d+=h
	add	x21,x21,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8			// *K++, x19 in next round
	add	x2,x2,x8
	add	x21,x21,x17			// h+=Sigma0(a)
	add	x2,x2,x7
	ldr	x7,[sp,#0]
	str	x10,[sp,#24]
	ror	x16,x25,#14
	add	x20,x20,x28			// h+=K[i]
	ror	x9,x4,#1
	and	x17,x26,x25
	ror	x8,x1,#19
	bic	x28,x27,x25
	ror	x10,x21,#28
	add	x20,x20,x2			// h+=X[i]
	eor	x16,x16,x25,ror#18
	eor	x9,x9,x4,ror#8
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x21,x22			// a^b, b^c in next round
	eor	x16,x16,x25,ror#41		// Sigma1(e)
	eor	x10,x10,x21,ror#34
	add	x20,x20,x17			// h+=Ch(e,f,g)
	and	x19,x19,x28			// (b^c)&=(a^b)
	eor	x8,x8,x1,ror#61
	eor	x9,x9,x4,lsr#7			// sigma0(X[i+1])
	add	x20,x20,x16			// h+=Sigma1(e)
	eor	x19,x19,x22			// Maj(a,b,c)
	eor	x17,x10,x21,ror#39		// Sigma0(a)
	eor	x8,x8,x1,lsr#6			// sigma1(X[i+14])
	add	x3,x3,x12
	add	x24,x24,x20			// d+=h
	add	x20,x20,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8			// *K++, x28 in next round
	add	x3,x3,x9
	add	x20,x20,x17			// h+=Sigma0(a)
	add	x3,x3,x8
	cbnz	x19,.Loop_16_xx			// zero K word => all rounds for this block done

	// ---- end of block: fold the working variables into the state
	ldp	x0,x2,[x29,#96]			// state pointer, end-of-input pointer
	ldr	x1,[x29,#112]			// input pointer saved at the top of .Loop
	sub	x30,x30,#648			// rewind

	ldp	x3,x4,[x0]
	ldp	x5,x6,[x0,#2*8]
	add	x1,x1,#14*8			// advance input pointer
	ldp	x7,x8,[x0,#4*8]
	add	x20,x20,x3
	ldp	x9,x10,[x0,#6*8]
	add	x21,x21,x4
	add	x22,x22,x5
	add	x23,x23,x6
	stp	x20,x21,[x0]
	add	x24,x24,x7
	add	x25,x25,x8
	stp	x22,x23,[x0,#2*8]
	add	x26,x26,x9
	add	x27,x27,x10
	cmp	x1,x2
	stp	x24,x25,[x0,#4*8]
	stp	x26,x27,[x0,#6*8]
	b.ne	.Loop

	// ---- epilogue: restore callee-saved registers and the frame
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#4*8
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
.inst	0xd50323bf				// autiasp
	ret
.size	sha512_block_data_order,.-sha512_block_data_order

// Table read sequentially through x30 (*K++) by the round code above.
.align	6
.type	.LK512,%object
.LK512:
.quad	0x428a2f98d728ae22,0x7137449123ef65cd
.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad	0x3956c25bf348b538,0x59f111f1b605d019
.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad	0xd807aa98a3030242,0x12835b0145706fbe
.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad	0x9bdc06a725c71235,0xc19bf174cf692694
.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad	0x983e5152ee66dfab,0xa831c66d2db43210
.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad	0x06ca6351e003826f,0x142929670a0e6e70
.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
.quad	0x81c2c92e47edaee6,0x92722c851482353b
.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
.quad	0xd192e819d6ef5218,0xd69906245565a910
.quad	0xf40e35855771202a,0x106aa07032bbd1b8
.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad	0x90befffa23631e28,0xa4506cebde82bde9
.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad	0xca273eceea26619c,0xd186b8c721c0c207
.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad	0x113f9804bef90dae,0x1b710b35131c471b
// NOTE(review): the operands of the following .quad sit at the start of the next source line.
.quad	
// Last four rows of the .LK512 round-constant table: K512[72] .. K512[79].
.quad	0x28db77f523047d84,0x32caab7b40c72493
.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
// Zero word after the 80 constants. The scalar round loop keeps going
// until the K word it loads is zero; the hardware path below never reads
// it and rewinds by a fixed 80*8 bytes instead.
.quad	0
.size	.LK512,.-.LK512
#ifndef	__KERNEL__
.align	3
// Distance from this word to OPENSSL_armcap_P. The entry code of
// sha512_block_data_order loads it, adds the address of this word and
// reads the capability bits from the resulting address.
.LOPENSSL_armcap_P:
# ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
# else
.quad	OPENSSL_armcap_P-.
# endif
#endif
// ASCII, NUL-terminated:
// "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
#ifndef	__KERNEL__
// ----------------------------------------------------------------------
// sha512_block_armv8 (entered through .Lv8_entry)
//
// SHA-512 block function built on the ARMv8 SHA512 crypto instructions.
// The four crypto opcodes are emitted as raw .inst words; the mnemonic
// each word stands for is given in the trailing comment.
// sha512_block_data_order branches to .Lv8_entry when ARMV8_SHA512 is set
// in OPENSSL_armcap_P, so the arguments are the ones that function got.
//
// In:    x0 = hash state, eight 64-bit words, loaded on entry and stored
//             on exit
//        x1 = input, consumed 128 bytes per loop iteration
//        x2 = iteration count. It is decremented before it is tested, so
//             it is presumably never zero on entry - confirm at callers.
// Uses:  x3 = cursor into .LK512
//        x4 = x1-128, rewind candidate for the final iteration
//        v0-v3   = working state, v4 = fifth register of the rotation
//        v5-v7   = ext temporaries
//        v16-v23 = sixteen-word message schedule, two words per register
//        v24,v25 = alternating pairs of K512 constants
//        v26-v29 = copy of the state taken at the top of the loop
// Frame: x29 and x30 are pushed as a pair. Only x29 is reloaded on exit,
//        since x30 is never written here and ret uses it directly.
//
// Each numbered step below handles one pair of K512 constants, so the 40
// steps cover all 80 constants. Steps 0-31 also advance the message
// schedule; steps 32-39 instead fetch and byte-swap the next input block.
// The instruction order is a hand-made schedule and must be preserved.
// ----------------------------------------------------------------------
.type	sha512_block_armv8,%function
.align	6
sha512_block_armv8:
.Lv8_entry:
	stp	x29,x30,[sp,#-16]!
	mov	x29,sp

	// first 128-byte block into the schedule registers
	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

	// current hash state
	ld1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]
	adr	x3,.LK512

	// reverse the bytes inside every 64-bit lane of the input
	rev64	v16.16b,v16.16b
	rev64	v17.16b,v17.16b
	rev64	v18.16b,v18.16b
	rev64	v19.16b,v19.16b
	rev64	v20.16b,v20.16b
	rev64	v21.16b,v21.16b
	rev64	v22.16b,v22.16b
	rev64	v23.16b,v23.16b
	b	.Loop_hw

.align	4
.Loop_hw:
	// ---- step 0: K512[0,1], schedule slot v16 ----
	ld1	{v24.2d},[x3],#16
	subs	x2,x2,#1			// flags feed the csel below
	sub	x4,x1,#128
	mov	v26.16b,v0.16b			// keep the incoming state for the final add
	mov	v27.16b,v1.16b
	mov	v28.16b,v2.16b
	mov	v29.16b,v3.16b
	// When x2 has just reached zero, step x1 back by one block so that the
	// next-input loads in steps 32-39 re-read this block instead of going
	// past it.
	csel	x1,x1,x4,ne
	add	v24.2d,v24.2d,v16.2d
	ld1	{v25.2d},[x3],#16
	ext	v24.16b,v24.16b,v24.16b,#8	// swap the two 64-bit halves
	ext	v5.16b,v2.16b,v3.16b,#8
	ext	v6.16b,v1.16b,v2.16b,#8
	add	v3.2d,v3.2d,v24.2d		// "T1 + H + K512[i]"
.inst	0xcec08230	// sha512su0 v16.16b,v17.16b
	ext	v7.16b,v20.16b,v21.16b,#8
.inst	0xce6680a3	// sha512h v3.16b,v5.16b,v6.16b
.inst	0xce678af0	// sha512su1 v16.16b,v23.16b,v7.16b
	add	v4.2d,v1.2d,v3.2d		// "D + T1"
.inst	0xce608423	// sha512h2 v3.16b,v1.16b,v0.16b

	// ---- step 1: K512[2,3], schedule slot v17 ----
	add	v25.2d,v25.2d,v17.2d
	ld1	{v24.2d},[x3],#16
	ext	v25.16b,v25.16b,v25.16b,#8
	ext	v5.16b,v4.16b,v2.16b,#8
	ext	v6.16b,v0.16b,v4.16b,#8
	add	v2.2d,v2.2d,v25.2d		// "T1 + H + K512[i]"
.inst	0xcec08251	// sha512su0 v17.16b,v18.16b
	ext	v7.16b,v21.16b,v22.16b,#8
.inst	0xce6680a2	// sha512h v2.16b,v5.16b,v6.16b
.inst	0xce678a11	// sha512su1 v17.16b,v16.16b,v7.16b
	add	v1.2d,v0.2d,v2.2d		// "D + T1"
.inst	0xce638402	// sha512h2 v2.16b,v0.16b,v3.16b

	// ---- step 2: K512[4,5], schedule slot v18 ----
	add	v24.2d,v24.2d,v18.2d
	ld1	{v25.2d},[x3],#16
	ext	v24.16b,v24.16b,v24.16b,#8
	ext	v5.16b,v1.16b,v4.16b,#8
	ext	v6.16b,v3.16b,v1.16b,#8
	add	v4.2d,v4.2d,v24.2d		// "T1 + H + K512[i]"
.inst	0xcec08272	// sha512su0 v18.16b,v19.16b
	ext	v7.16b,v22.16b,v23.16b,#8
.inst	0xce6680a4	// sha512h v4.16b,v5.16b,v6.16b
.inst	0xce678a32	// sha512su1 v18.16b,v17.16b,v7.16b
	add	v0.2d,v3.2d,v4.2d		// "D + T1"
.inst	0xce628464	// sha512h2 v4.16b,v3.16b,v2.16b

	// ---- step 3: K512[6,7], schedule slot v19 ----
	add	v25.2d,v25.2d,v19.2d
	ld1	{v24.2d},[x3],#16
	ext	v25.16b,v25.16b,v25.16b,#8
	ext	v5.16b,v0.16b,v1.16b,#8
	ext	v6.16b,v2.16b,v0.16b,#8
	add	v1.2d,v1.2d,v25.2d		// "T1 + H + K512[i]"
.inst	0xcec08293	// sha512su0 v19.16b,v20.16b
	ext	v7.16b,v23.16b,v16.16b,#8
.inst	0xce6680a1	// sha512h v1.16b,v5.16b,v6.16b
.inst	0xce678a53	// sha512su1 v19.16b,v18.16b,v7.16b
	add	v3.2d,v2.2d,v1.2d		// "D + T1"
.inst	0xce648441	// sha512h2 v1.16b,v2.16b,v4.16b

	// ---- step 4: K512[8,9], schedule slot v20 ----
	add	v24.2d,v24.2d,v20.2d
	ld1	{v25.2d},[x3],#16
	ext	v24.16b,v24.16b,v24.16b,#8
	ext	v5.16b,v3.16b,v0.16b,#8
	ext	v6.16b,v4.16b,v3.16b,#8
	add	v0.2d,v0.2d,v24.2d		// "T1 + H + K512[i]"
.inst	0xcec082b4	// sha512su0 v20.16b,v21.16b
	ext	v7.16b,v16.16b,v17.16b,#8
.inst	0xce6680a0	// sha512h v0.16b,v5.16b,v6.16b
.inst	0xce678a74	// sha512su1 v20.16b,v19.16b,v7.16b
	add	v2.2d,v4.2d,v0.2d		// "D + T1"
.inst	0xce618480	// sha512h2 v0.16b,v4.16b,v1.16b

	// ---- step 5: K512[10,11], schedule slot v21 ----
	add	v25.2d,v25.2d,v21.2d
	ld1	{v24.2d},[x3],#16
	ext	v25.16b,v25.16b,v25.16b,#8
	ext	v5.16b,v2.16b,v3.16b,#8
	ext	v6.16b,v1.16b,v2.16b,#8
	add	v3.2d,v3.2d,v25.2d		// "T1 + H + K512[i]"
.inst	0xcec082d5	// sha512su0 v21.16b,v22.16b
	ext	v7.16b,v17.16b,v18.16b,#8
.inst	0xce6680a3	// sha512h v3.16b,v5.16b,v6.16b
.inst	0xce678a95	// sha512su1 v21.16b,v20.16b,v7.16b
	add	v4.2d,v1.2d,v3.2d		// "D + T1"
.inst	0xce608423	// sha512h2 v3.16b,v1.16b,v0.16b

	// ---- step 6: K512[12,13], schedule slot v22 ----
	add	v24.2d,v24.2d,v22.2d
	ld1	{v25.2d},[x3],#16
	ext	v24.16b,v24.16b,v24.16b,#8
	ext	v5.16b,v4.16b,v2.16b,#8
	ext	v6.16b,v0.16b,v4.16b,#8
	add	v2.2d,v2.2d,v24.2d		// "T1 + H + K512[i]"
.inst	0xcec082f6	// sha512su0 v22.16b,v23.16b
	ext	v7.16b,v18.16b,v19.16b,#8
.inst	0xce6680a2	// sha512h v2.16b,v5.16b,v6.16b
.inst	0xce678ab6	// sha512su1 v22.16b,v21.16b,v7.16b
	add	v1.2d,v0.2d,v2.2d		// "D + T1"
.inst	0xce638402	// sha512h2 v2.16b,v0.16b,v3.16b

	// ---- step 7: K512[14,15], schedule slot v23 ----
	add	v25.2d,v25.2d,v23.2d
	ld1	{v24.2d},[x3],#16
	ext	v25.16b,v25.16b,v25.16b,#8
	ext	v5.16b,v1.16b,v4.16b,#8
	ext	v6.16b,v3.16b,v1.16b,#8
	add	v4.2d,v4.2d,v25.2d		// "T1 + H + K512[i]"
.inst	0xcec08217	// sha512su0 v23.16b,v16.16b
	ext	v7.16b,v19.16b,v20.16b,#8
.inst	0xce6680a4	// sha512h v4.16b,v5.16b,v6.16b
.inst	0xce678ad7	// sha512su1 v23.16b,v22.16b,v7.16b
	add	v0.2d,v3.2d,v4.2d		// "D + T1"
.inst	0xce628464	// sha512h2 v4.16b,v3.16b,v2.16b

	// ---- step 8: K512[16,17], schedule slot v16 ----
	add	v24.2d,v24.2d,v16.2d
	ld1	{v25.2d},[x3],#16
	ext	v24.16b,v24.16b,v24.16b,#8
	ext	v5.16b,v0.16b,v1.16b,#8
	ext	v6.16b,v2.16b,v0.16b,#8
	add	v1.2d,v1.2d,v24.2d		// "T1 + H + K512[i]"
.inst	0xcec08230	// sha512su0 v16.16b,v17.16b
	ext	v7.16b,v20.16b,v21.16b,#8
.inst	0xce6680a1	// sha512h v1.16b,v5.16b,v6.16b
.inst	0xce678af0	// sha512su1 v16.16b,v23.16b,v7.16b
	add	v3.2d,v2.2d,v1.2d		// "D + T1"
.inst	0xce648441	// sha512h2 v1.16b,v2.16b,v4.16b

	// ---- step 9: K512[18,19], schedule slot v17 ----
	add	v25.2d,v25.2d,v17.2d
	ld1	{v24.2d},[x3],#16
	ext	v25.16b,v25.16b,v25.16b,#8
	ext	v5.16b,v3.16b,v0.16b,#8
	ext	v6.16b,v4.16b,v3.16b,#8
	add	v0.2d,v0.2d,v25.2d		// "T1 + H + K512[i]"
.inst	0xcec08251	// sha512su0 v17.16b,v18.16b
	ext	v7.16b,v21.16b,v22.16b,#8
.inst	0xce6680a0	// sha512h v0.16b,v5.16b,v6.16b
.inst	0xce678a11	// sha512su1 v17.16b,v16.16b,v7.16b
	add	v2.2d,v4.2d,v0.2d		// "D + T1"
.inst	0xce618480	// sha512h2 v0.16b,v4.16b,v1.16b

	// ---- step 10: K512[20,21], schedule slot v18 ----
	add	v24.2d,v24.2d,v18.2d
	ld1	{v25.2d},[x3],#16
	ext	v24.16b,v24.16b,v24.16b,#8
	ext	v5.16b,v2.16b,v3.16b,#8
	ext	v6.16b,v1.16b,v2.16b,#8
	add	v3.2d,v3.2d,v24.2d		// "T1 + H + K512[i]"
.inst	0xcec08272	// sha512su0 v18.16b,v19.16b
	ext	v7.16b,v22.16b,v23.16b,#8
.inst	0xce6680a3	// sha512h v3.16b,v5.16b,v6.16b
.inst	0xce678a32	// sha512su1 v18.16b,v17.16b,v7.16b
	add	v4.2d,v1.2d,v3.2d		// "D + T1"
.inst	0xce608423	// sha512h2 v3.16b,v1.16b,v0.16b

	// ---- step 11: K512[22,23], schedule slot v19 ----
	add	v25.2d,v25.2d,v19.2d
	ld1	{v24.2d},[x3],#16
	ext	v25.16b,v25.16b,v25.16b,#8
	ext	v5.16b,v4.16b,v2.16b,#8
	ext	v6.16b,v0.16b,v4.16b,#8
	add	v2.2d,v2.2d,v25.2d		// "T1 + H + K512[i]"
.inst	0xcec08293	// sha512su0 v19.16b,v20.16b
	ext	v7.16b,v23.16b,v16.16b,#8
.inst	0xce6680a2	// sha512h v2.16b,v5.16b,v6.16b
.inst	0xce678a53	// sha512su1 v19.16b,v18.16b,v7.16b
	add	v1.2d,v0.2d,v2.2d		// "D + T1"
.inst	0xce638402	// sha512h2 v2.16b,v0.16b,v3.16b

	// ---- step 12: K512[24,25], schedule slot v20 ----
	add	v24.2d,v24.2d,v20.2d
	ld1	{v25.2d},[x3],#16
	ext	v24.16b,v24.16b,v24.16b,#8
	ext	v5.16b,v1.16b,v4.16b,#8
	ext	v6.16b,v3.16b,v1.16b,#8
	add	v4.2d,v4.2d,v24.2d		// "T1 + H + K512[i]"
.inst	0xcec082b4	// sha512su0 v20.16b,v21.16b
	ext	v7.16b,v16.16b,v17.16b,#8
.inst	0xce6680a4	// sha512h v4.16b,v5.16b,v6.16b
.inst	0xce678a74	// sha512su1 v20.16b,v19.16b,v7.16b
	add	v0.2d,v3.2d,v4.2d		// "D + T1"
.inst	0xce628464	// sha512h2 v4.16b,v3.16b,v2.16b

	// ---- step 13: K512[26,27], schedule slot v21 ----
	add	v25.2d,v25.2d,v21.2d
	ld1	{v24.2d},[x3],#16
	ext	v25.16b,v25.16b,v25.16b,#8
	ext	v5.16b,v0.16b,v1.16b,#8
	ext	v6.16b,v2.16b,v0.16b,#8
	add	v1.2d,v1.2d,v25.2d		// "T1 + H + K512[i]"
.inst	0xcec082d5	// sha512su0 v21.16b,v22.16b
	ext	v7.16b,v17.16b,v18.16b,#8
.inst	0xce6680a1	// sha512h v1.16b,v5.16b,v6.16b
.inst	0xce678a95	// sha512su1 v21.16b,v20.16b,v7.16b
	add	v3.2d,v2.2d,v1.2d		// "D + T1"
.inst	0xce648441	// sha512h2 v1.16b,v2.16b,v4.16b

	// ---- step 14: K512[28,29], schedule slot v22 ----
	add	v24.2d,v24.2d,v22.2d
	ld1	{v25.2d},[x3],#16
	ext	v24.16b,v24.16b,v24.16b,#8
	ext	v5.16b,v3.16b,v0.16b,#8
	ext	v6.16b,v4.16b,v3.16b,#8
	add	v0.2d,v0.2d,v24.2d		// "T1 + H + K512[i]"
.inst	0xcec082f6	// sha512su0 v22.16b,v23.16b
	ext	v7.16b,v18.16b,v19.16b,#8
.inst	0xce6680a0	// sha512h v0.16b,v5.16b,v6.16b
.inst	0xce678ab6	// sha512su1 v22.16b,v21.16b,v7.16b
	add	v2.2d,v4.2d,v0.2d		// "D + T1"
.inst	0xce618480	// sha512h2 v0.16b,v4.16b,v1.16b

	// ---- step 15: K512[30,31], schedule slot v23 ----
	add	v25.2d,v25.2d,v23.2d
	ld1	{v24.2d},[x3],#16
	ext	v25.16b,v25.16b,v25.16b,#8
	ext	v5.16b,v2.16b,v3.16b,#8
	ext	v6.16b,v1.16b,v2.16b,#8
	add	v3.2d,v3.2d,v25.2d		// "T1 + H + K512[i]"
.inst	0xcec08217	// sha512su0 v23.16b,v16.16b
	ext	v7.16b,v19.16b,v20.16b,#8
.inst	0xce6680a3	// sha512h v3.16b,v5.16b,v6.16b
.inst	0xce678ad7	// sha512su1 v23.16b,v22.16b,v7.16b
	add	v4.2d,v1.2d,v3.2d		// "D + T1"
.inst	0xce608423	// sha512h2 v3.16b,v1.16b,v0.16b

	// ---- step 16: K512[32,33], schedule slot v16 ----
	add	v24.2d,v24.2d,v16.2d
	ld1	{v25.2d},[x3],#16
	ext	v24.16b,v24.16b,v24.16b,#8
	ext	v5.16b,v4.16b,v2.16b,#8
	ext	v6.16b,v0.16b,v4.16b,#8
	add	v2.2d,v2.2d,v24.2d		// "T1 + H + K512[i]"
.inst	0xcec08230	// sha512su0 v16.16b,v17.16b
	ext	v7.16b,v20.16b,v21.16b,#8
.inst	0xce6680a2	// sha512h v2.16b,v5.16b,v6.16b
.inst	0xce678af0	// sha512su1 v16.16b,v23.16b,v7.16b
	add	v1.2d,v0.2d,v2.2d		// "D + T1"
.inst	0xce638402	// sha512h2 v2.16b,v0.16b,v3.16b

	// ---- step 17: K512[34,35], schedule slot v17 ----
	add	v25.2d,v25.2d,v17.2d
	ld1	{v24.2d},[x3],#16
	ext	v25.16b,v25.16b,v25.16b,#8
	ext	v5.16b,v1.16b,v4.16b,#8
	ext	v6.16b,v3.16b,v1.16b,#8
	add	v4.2d,v4.2d,v25.2d		// "T1 + H + K512[i]"
.inst	0xcec08251	// sha512su0 v17.16b,v18.16b
	ext	v7.16b,v21.16b,v22.16b,#8
.inst	0xce6680a4	// sha512h v4.16b,v5.16b,v6.16b
.inst	0xce678a11	// sha512su1 v17.16b,v16.16b,v7.16b
	add	v0.2d,v3.2d,v4.2d		// "D + T1"
.inst	0xce628464	// sha512h2 v4.16b,v3.16b,v2.16b

	// ---- step 18: K512[36,37], schedule slot v18 ----
	add	v24.2d,v24.2d,v18.2d
	ld1	{v25.2d},[x3],#16
	ext	v24.16b,v24.16b,v24.16b,#8
	ext	v5.16b,v0.16b,v1.16b,#8
	ext	v6.16b,v2.16b,v0.16b,#8
	add	v1.2d,v1.2d,v24.2d		// "T1 + H + K512[i]"
.inst	0xcec08272	// sha512su0 v18.16b,v19.16b
	ext	v7.16b,v22.16b,v23.16b,#8
.inst	0xce6680a1	// sha512h v1.16b,v5.16b,v6.16b
.inst	0xce678a32	// sha512su1 v18.16b,v17.16b,v7.16b
	add	v3.2d,v2.2d,v1.2d		// "D + T1"
.inst	0xce648441	// sha512h2 v1.16b,v2.16b,v4.16b

	// ---- step 19: K512[38,39], schedule slot v19 ----
	add	v25.2d,v25.2d,v19.2d
	ld1	{v24.2d},[x3],#16
	ext	v25.16b,v25.16b,v25.16b,#8
	ext	v5.16b,v3.16b,v0.16b,#8
	ext	v6.16b,v4.16b,v3.16b,#8
	add	v0.2d,v0.2d,v25.2d		// "T1 + H + K512[i]"
.inst	0xcec08293	// sha512su0 v19.16b,v20.16b
	ext	v7.16b,v23.16b,v16.16b,#8
.inst	0xce6680a0	// sha512h v0.16b,v5.16b,v6.16b
.inst	0xce678a53	// sha512su1 v19.16b,v18.16b,v7.16b
	add	v2.2d,v4.2d,v0.2d		// "D + T1"
.inst	0xce618480	// sha512h2 v0.16b,v4.16b,v1.16b

	// ---- step 20: K512[40,41], schedule slot v20 ----
	add	v24.2d,v24.2d,v20.2d
	ld1	{v25.2d},[x3],#16
	ext	v24.16b,v24.16b,v24.16b,#8
	ext	v5.16b,v2.16b,v3.16b,#8
	ext	v6.16b,v1.16b,v2.16b,#8
	add	v3.2d,v3.2d,v24.2d		// "T1 + H + K512[i]"
.inst	0xcec082b4	// sha512su0 v20.16b,v21.16b
	ext	v7.16b,v16.16b,v17.16b,#8
.inst	0xce6680a3	// sha512h v3.16b,v5.16b,v6.16b
.inst	0xce678a74	// sha512su1 v20.16b,v19.16b,v7.16b
	add	v4.2d,v1.2d,v3.2d		// "D + T1"
.inst	0xce608423	// sha512h2 v3.16b,v1.16b,v0.16b

	// ---- step 21: K512[42,43], schedule slot v21 ----
	add	v25.2d,v25.2d,v21.2d
	ld1	{v24.2d},[x3],#16
	ext	v25.16b,v25.16b,v25.16b,#8
	ext	v5.16b,v4.16b,v2.16b,#8
	ext	v6.16b,v0.16b,v4.16b,#8
	add	v2.2d,v2.2d,v25.2d		// "T1 + H + K512[i]"
.inst	0xcec082d5	// sha512su0 v21.16b,v22.16b
	ext	v7.16b,v17.16b,v18.16b,#8
.inst	0xce6680a2	// sha512h v2.16b,v5.16b,v6.16b
.inst	0xce678a95	// sha512su1 v21.16b,v20.16b,v7.16b
	add	v1.2d,v0.2d,v2.2d		// "D + T1"
.inst	0xce638402	// sha512h2 v2.16b,v0.16b,v3.16b

	// ---- step 22: K512[44,45], schedule slot v22 ----
	add	v24.2d,v24.2d,v22.2d
	ld1	{v25.2d},[x3],#16
	ext	v24.16b,v24.16b,v24.16b,#8
	ext	v5.16b,v1.16b,v4.16b,#8
	ext	v6.16b,v3.16b,v1.16b,#8
	add	v4.2d,v4.2d,v24.2d		// "T1 + H + K512[i]"
.inst	0xcec082f6	// sha512su0 v22.16b,v23.16b
	ext	v7.16b,v18.16b,v19.16b,#8
.inst	0xce6680a4	// sha512h v4.16b,v5.16b,v6.16b
.inst	0xce678ab6	// sha512su1 v22.16b,v21.16b,v7.16b
	add	v0.2d,v3.2d,v4.2d		// "D + T1"
.inst	0xce628464	// sha512h2 v4.16b,v3.16b,v2.16b

	// ---- step 23: K512[46,47], schedule slot v23 ----
	add	v25.2d,v25.2d,v23.2d
	ld1	{v24.2d},[x3],#16
	ext	v25.16b,v25.16b,v25.16b,#8
	ext	v5.16b,v0.16b,v1.16b,#8
	ext	v6.16b,v2.16b,v0.16b,#8
	add	v1.2d,v1.2d,v25.2d		// "T1 + H + K512[i]"
.inst	0xcec08217	// sha512su0 v23.16b,v16.16b
	ext	v7.16b,v19.16b,v20.16b,#8
.inst	0xce6680a1	// sha512h v1.16b,v5.16b,v6.16b
.inst	0xce678ad7	// sha512su1 v23.16b,v22.16b,v7.16b
	add	v3.2d,v2.2d,v1.2d		// "D + T1"
.inst	0xce648441	// sha512h2 v1.16b,v2.16b,v4.16b

	// ---- step 24: K512[48,49], schedule slot v16 ----
	add	v24.2d,v24.2d,v16.2d
	ld1	{v25.2d},[x3],#16
	ext	v24.16b,v24.16b,v24.16b,#8
	ext	v5.16b,v3.16b,v0.16b,#8
	ext	v6.16b,v4.16b,v3.16b,#8
	add	v0.2d,v0.2d,v24.2d		// "T1 + H + K512[i]"
.inst	0xcec08230	// sha512su0 v16.16b,v17.16b
	ext	v7.16b,v20.16b,v21.16b,#8
.inst	0xce6680a0	// sha512h v0.16b,v5.16b,v6.16b
.inst	0xce678af0	// sha512su1 v16.16b,v23.16b,v7.16b
	add	v2.2d,v4.2d,v0.2d		// "D + T1"
.inst	0xce618480	// sha512h2 v0.16b,v4.16b,v1.16b

	// ---- step 25: K512[50,51], schedule slot v17 ----
	add	v25.2d,v25.2d,v17.2d
	ld1	{v24.2d},[x3],#16
	ext	v25.16b,v25.16b,v25.16b,#8
	ext	v5.16b,v2.16b,v3.16b,#8
	ext	v6.16b,v1.16b,v2.16b,#8
	add	v3.2d,v3.2d,v25.2d		// "T1 + H + K512[i]"
.inst	0xcec08251	// sha512su0 v17.16b,v18.16b
	ext	v7.16b,v21.16b,v22.16b,#8
.inst	0xce6680a3	// sha512h v3.16b,v5.16b,v6.16b
.inst	0xce678a11	// sha512su1 v17.16b,v16.16b,v7.16b
	add	v4.2d,v1.2d,v3.2d		// "D + T1"
.inst	0xce608423	// sha512h2 v3.16b,v1.16b,v0.16b

	// ---- step 26: K512[52,53], schedule slot v18 ----
	add	v24.2d,v24.2d,v18.2d
	ld1	{v25.2d},[x3],#16
	ext	v24.16b,v24.16b,v24.16b,#8
	ext	v5.16b,v4.16b,v2.16b,#8
	ext	v6.16b,v0.16b,v4.16b,#8
	add	v2.2d,v2.2d,v24.2d		// "T1 + H + K512[i]"
.inst	0xcec08272	// sha512su0 v18.16b,v19.16b
	ext	v7.16b,v22.16b,v23.16b,#8
.inst	0xce6680a2	// sha512h v2.16b,v5.16b,v6.16b
.inst	0xce678a32	// sha512su1 v18.16b,v17.16b,v7.16b
	add	v1.2d,v0.2d,v2.2d		// "D + T1"
.inst	0xce638402	// sha512h2 v2.16b,v0.16b,v3.16b

	// ---- step 27: K512[54,55], schedule slot v19 ----
	add	v25.2d,v25.2d,v19.2d
	ld1	{v24.2d},[x3],#16
	ext	v25.16b,v25.16b,v25.16b,#8
	ext	v5.16b,v1.16b,v4.16b,#8
	ext	v6.16b,v3.16b,v1.16b,#8
	add	v4.2d,v4.2d,v25.2d		// "T1 + H + K512[i]"
.inst	0xcec08293	// sha512su0 v19.16b,v20.16b
	ext	v7.16b,v23.16b,v16.16b,#8
.inst	0xce6680a4	// sha512h v4.16b,v5.16b,v6.16b
.inst	0xce678a53	// sha512su1 v19.16b,v18.16b,v7.16b
	add	v0.2d,v3.2d,v4.2d		// "D + T1"
.inst	0xce628464	// sha512h2 v4.16b,v3.16b,v2.16b

	// ---- step 28: K512[56,57], schedule slot v20 ----
	add	v24.2d,v24.2d,v20.2d
	ld1	{v25.2d},[x3],#16
	ext	v24.16b,v24.16b,v24.16b,#8
	ext	v5.16b,v0.16b,v1.16b,#8
	ext	v6.16b,v2.16b,v0.16b,#8
	add	v1.2d,v1.2d,v24.2d		// "T1 + H + K512[i]"
.inst	0xcec082b4	// sha512su0 v20.16b,v21.16b
	ext	v7.16b,v16.16b,v17.16b,#8
.inst	0xce6680a1	// sha512h v1.16b,v5.16b,v6.16b
.inst	0xce678a74	// sha512su1 v20.16b,v19.16b,v7.16b
	add	v3.2d,v2.2d,v1.2d		// "D + T1"
.inst	0xce648441	// sha512h2 v1.16b,v2.16b,v4.16b

	// ---- step 29: K512[58,59], schedule slot v21 ----
	add	v25.2d,v25.2d,v21.2d
	ld1	{v24.2d},[x3],#16
	ext	v25.16b,v25.16b,v25.16b,#8
	ext	v5.16b,v3.16b,v0.16b,#8
	ext	v6.16b,v4.16b,v3.16b,#8
	add	v0.2d,v0.2d,v25.2d		// "T1 + H + K512[i]"
.inst	0xcec082d5	// sha512su0 v21.16b,v22.16b
	ext	v7.16b,v17.16b,v18.16b,#8
.inst	0xce6680a0	// sha512h v0.16b,v5.16b,v6.16b
.inst	0xce678a95	// sha512su1 v21.16b,v20.16b,v7.16b
	add	v2.2d,v4.2d,v0.2d		// "D + T1"
.inst	0xce618480	// sha512h2 v0.16b,v4.16b,v1.16b

	// ---- step 30: K512[60,61], schedule slot v22 ----
	add	v24.2d,v24.2d,v22.2d
	ld1	{v25.2d},[x3],#16
	ext	v24.16b,v24.16b,v24.16b,#8
	ext	v5.16b,v2.16b,v3.16b,#8
	ext	v6.16b,v1.16b,v2.16b,#8
	add	v3.2d,v3.2d,v24.2d		// "T1 + H + K512[i]"
.inst	0xcec082f6	// sha512su0 v22.16b,v23.16b
	ext	v7.16b,v18.16b,v19.16b,#8
.inst	0xce6680a3	// sha512h v3.16b,v5.16b,v6.16b
.inst	0xce678ab6	// sha512su1 v22.16b,v21.16b,v7.16b
	add	v4.2d,v1.2d,v3.2d		// "D + T1"
.inst	0xce608423	// sha512h2 v3.16b,v1.16b,v0.16b

	// ---- step 31: K512[62,63], schedule slot v23 ----
	add	v25.2d,v25.2d,v23.2d
	ld1	{v24.2d},[x3],#16
	ext	v25.16b,v25.16b,v25.16b,#8
	ext	v5.16b,v4.16b,v2.16b,#8
	ext	v6.16b,v0.16b,v4.16b,#8
	add	v2.2d,v2.2d,v25.2d		// "T1 + H + K512[i]"
.inst	0xcec08217	// sha512su0 v23.16b,v16.16b
	ext	v7.16b,v19.16b,v20.16b,#8
.inst	0xce6680a2	// sha512h v2.16b,v5.16b,v6.16b
.inst	0xce678ad7	// sha512su1 v23.16b,v22.16b,v7.16b
	add	v1.2d,v0.2d,v2.2d		// "D + T1"
.inst	0xce638402	// sha512h2 v2.16b,v0.16b,v3.16b

	// Steps 32-39 no longer extend the schedule. Each one reloads its
	// schedule register from the input stream and byte-swaps it, ready
	// for the next pass through the loop.

	// ---- step 32: K512[64,65], schedule slot v16 ----
	ld1	{v25.2d},[x3],#16
	add	v24.2d,v24.2d,v16.2d
	ld1	{v16.16b},[x1],#16		// fetch next input
	ext	v24.16b,v24.16b,v24.16b,#8
	ext	v5.16b,v1.16b,v4.16b,#8
	ext	v6.16b,v3.16b,v1.16b,#8
	add	v4.2d,v4.2d,v24.2d		// "T1 + H + K512[i]"
.inst	0xce6680a4	// sha512h v4.16b,v5.16b,v6.16b
	rev64	v16.16b,v16.16b
	add	v0.2d,v3.2d,v4.2d		// "D + T1"
.inst	0xce628464	// sha512h2 v4.16b,v3.16b,v2.16b

	// ---- step 33: K512[66,67], schedule slot v17 ----
	ld1	{v24.2d},[x3],#16
	add	v25.2d,v25.2d,v17.2d
	ld1	{v17.16b},[x1],#16		// fetch next input
	ext	v25.16b,v25.16b,v25.16b,#8
	ext	v5.16b,v0.16b,v1.16b,#8
	ext	v6.16b,v2.16b,v0.16b,#8
	add	v1.2d,v1.2d,v25.2d		// "T1 + H + K512[i]"
.inst	0xce6680a1	// sha512h v1.16b,v5.16b,v6.16b
	rev64	v17.16b,v17.16b
	add	v3.2d,v2.2d,v1.2d		// "D + T1"
.inst	0xce648441	// sha512h2 v1.16b,v2.16b,v4.16b

	// ---- step 34: K512[68,69], schedule slot v18 ----
	ld1	{v25.2d},[x3],#16
	add	v24.2d,v24.2d,v18.2d
	ld1	{v18.16b},[x1],#16		// fetch next input
	ext	v24.16b,v24.16b,v24.16b,#8
	ext	v5.16b,v3.16b,v0.16b,#8
	ext	v6.16b,v4.16b,v3.16b,#8
	add	v0.2d,v0.2d,v24.2d		// "T1 + H + K512[i]"
.inst	0xce6680a0	// sha512h v0.16b,v5.16b,v6.16b
	rev64	v18.16b,v18.16b
	add	v2.2d,v4.2d,v0.2d		// "D + T1"
.inst	0xce618480	// sha512h2 v0.16b,v4.16b,v1.16b

	// ---- step 35: K512[70,71], schedule slot v19 ----
	ld1	{v24.2d},[x3],#16
	add	v25.2d,v25.2d,v19.2d
	ld1	{v19.16b},[x1],#16		// fetch next input
	ext	v25.16b,v25.16b,v25.16b,#8
	ext	v5.16b,v2.16b,v3.16b,#8
	ext	v6.16b,v1.16b,v2.16b,#8
	add	v3.2d,v3.2d,v25.2d		// "T1 + H + K512[i]"
.inst	0xce6680a3	// sha512h v3.16b,v5.16b,v6.16b
	rev64	v19.16b,v19.16b
	add	v4.2d,v1.2d,v3.2d		// "D + T1"
.inst	0xce608423	// sha512h2 v3.16b,v1.16b,v0.16b

	// ---- step 36: K512[72,73], schedule slot v20 ----
	ld1	{v25.2d},[x3],#16
	add	v24.2d,v24.2d,v20.2d
	ld1	{v20.16b},[x1],#16		// fetch next input
	ext	v24.16b,v24.16b,v24.16b,#8
	ext	v5.16b,v4.16b,v2.16b,#8
	ext	v6.16b,v0.16b,v4.16b,#8
	add	v2.2d,v2.2d,v24.2d		// "T1 + H + K512[i]"
.inst	0xce6680a2	// sha512h v2.16b,v5.16b,v6.16b
	rev64	v20.16b,v20.16b
	add	v1.2d,v0.2d,v2.2d		// "D + T1"
.inst	0xce638402	// sha512h2 v2.16b,v0.16b,v3.16b

	// ---- step 37: K512[74,75], schedule slot v21 ----
	ld1	{v24.2d},[x3],#16
	add	v25.2d,v25.2d,v21.2d
	ld1	{v21.16b},[x1],#16		// fetch next input
	ext	v25.16b,v25.16b,v25.16b,#8
	ext	v5.16b,v1.16b,v4.16b,#8
	ext	v6.16b,v3.16b,v1.16b,#8
	add	v4.2d,v4.2d,v25.2d		// "T1 + H + K512[i]"
.inst	0xce6680a4	// sha512h v4.16b,v5.16b,v6.16b
	rev64	v21.16b,v21.16b
	add	v0.2d,v3.2d,v4.2d		// "D + T1"
.inst	0xce628464	// sha512h2 v4.16b,v3.16b,v2.16b

	// ---- step 38: K512[76,77], schedule slot v22 ----
	ld1	{v25.2d},[x3],#16
	add	v24.2d,v24.2d,v22.2d
	ld1	{v22.16b},[x1],#16		// fetch next input
	ext	v24.16b,v24.16b,v24.16b,#8
	ext	v5.16b,v0.16b,v1.16b,#8
	ext	v6.16b,v2.16b,v0.16b,#8
	add	v1.2d,v1.2d,v24.2d		// "T1 + H + K512[i]"
.inst	0xce6680a1	// sha512h v1.16b,v5.16b,v6.16b
	rev64	v22.16b,v22.16b
	add	v3.2d,v2.2d,v1.2d		// "D + T1"
.inst	0xce648441	// sha512h2 v1.16b,v2.16b,v4.16b

	// ---- step 39: K512[78,79], schedule slot v23 ----
	sub	x3,x3,#80*8			// all 40 pairs fetched: cursor back to .LK512
	add	v25.2d,v25.2d,v23.2d
	ld1	{v23.16b},[x1],#16		// fetch next input
	ext	v25.16b,v25.16b,v25.16b,#8
	ext	v5.16b,v3.16b,v0.16b,#8
	ext	v6.16b,v4.16b,v3.16b,#8
	add	v0.2d,v0.2d,v25.2d		// "T1 + H + K512[i]"
.inst	0xce6680a0	// sha512h v0.16b,v5.16b,v6.16b
	rev64	v23.16b,v23.16b
	add	v2.2d,v4.2d,v0.2d		// "D + T1"
.inst	0xce618480	// sha512h2 v0.16b,v4.16b,v1.16b

	// add the state saved at the top of the loop
	add	v0.2d,v0.2d,v26.2d
	add	v1.2d,v1.2d,v27.2d
	add	v2.2d,v2.2d,v28.2d
	add	v3.2d,v3.2d,v29.2d

	cbnz	x2,.Loop_hw

	// write the updated state back
	st1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]

	ldr	x29,[sp],#16			// x30 is still the incoming return address
	ret
.size	sha512_block_armv8,.-sha512_block_armv8
#endif