/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */


#include "hitls_build.h"
#ifdef HITLS_CRYPTO_SM3

#include "crypt_arm.h"
.arch   armv8-a+crypto

/*
 * first16: one SM3 compression round for rounds 0..15, where the boolean
 * functions are FF(X, Y, Z) = GG(X, Y, Z) = X ^ Y ^ Z.
 *
 * Arguments (all w registers):
 *   A..H  working variables A..H for this round
 *   W1    message word W[j]
 *   W2    message word W[j + 4]; the round uses W1 ^ W2 as its second word
 *
 * w13 carries the round constant. It is rotated left by one bit at the top of
 * every invocation, so the caller loads it pre-rotated right by one bit.
 *
 * Results are written in place:
 *   D = TT1, H = P0(TT2), B = B <<< 9, F = F <<< 19; A, C, E, G are unchanged.
 * The caller performs the SM3 variable shift by renaming registers: the next
 * round is invoked with (D, A, B, C, H, E, F, G) in the A..H positions.
 *
 * Clobbers: w9, w10, w11, w12 (and updates w13).
 */
.macro first16 A B C D E F G H W1 W2
    ror     w13, w13, #31           // round constant: w13 = w13 <<< 1
    ror     w10, \A, #20            // w10 = A <<< 12
    add     w9, \E, w10
    eor     w12, \E, \F
    ror     \F, \F, #13             // F = F <<< 19 (old F already consumed above)
    eor     w12, w12, \G            // w12 = GG = E ^ F ^ G
    add     w12, w12, \H
    add     w9, w9, w13
    ror     w9, w9, #25             // w9 = SS1 = ((A <<< 12) + E + Tj) <<< 7
    add     w12, w12, w9
    eor     w10, w10, w9            // w10 = SS2 = SS1 ^ (A <<< 12)
    add     w12, w12, \W1           // w12 = TT2 = GG + H + SS1 + W1
    eor     \H, w12, w12, ror #23   // TT2 ^ (TT2 <<< 9)
    ror     w9, w12, #15            // TT2 <<< 17
    eor     \H, \H, w9              // H = P0(TT2)
    eor     w11, \A, \B
    ror     \B, \B, #23             // B = B <<< 9 (old B already consumed above)
    eor     w11, w11, \C            // w11 = FF = A ^ B ^ C
    add     w11, w11, \D
    add     w11, w11, w10
    eor     w9, \W1, \W2            // second round word: W1 ^ W2
    add     \D, w11, w9             // D = TT1 = FF + D + SS2 + (W1 ^ W2)
.endm

/*
 * second48: one SM3 compression round for rounds 16..63, where
 *   FF(X, Y, Z) = (X & Y) | (X & Z) | (Y & Z), computed as (Y & Z) | (X & (Y | Z))
 *   GG(X, Y, Z) = (X & Y) | (~X & Z),          computed as ((Y ^ Z) & X) ^ Z
 *
 * Arguments, round-constant handling in w13, in-place results and the
 * register-renaming convention are the same as for first16.
 *
 * Clobbers: w9, w10, w11, w12, w14 (and updates w13).
 */
.macro second48 A B C D E F G H W1 W2
    ror     w13, w13, #31           // round constant: w13 = w13 <<< 1
    orr     w11, \B, \C             // w11 = B | C
    eor     w12, \F, \G             // w12 = F ^ G
    ror     \F, \F, #13             // F = F <<< 19 (old F already consumed above)
    ror     w10, \A, #20            // w10 = A <<< 12
    add     w9, w10, \E
    and     w14, \A, w11            // w14 = A & (B | C)
    and     w12, w12, \E            // (F ^ G) & E
    eor     w12, w12, \G            // w12 = GG
    add     w12, w12, \H
    add     w9, w9, w13
    ror     w9, w9, #25             // w9 = SS1 = ((A <<< 12) + E + Tj) <<< 7
    add     w12, w12, w9
    eor     w10, w10, w9            // w10 = SS2 = SS1 ^ (A <<< 12)
    add     w12, w12, \W1           // w12 = TT2 = GG + H + SS1 + W1
    and     w11, \B, \C             // w11 = B & C
    ror     \B, \B, #23             // B = B <<< 9 (old B already consumed above)
    orr     w11, w11, w14           // w11 = FF
    eor     w9, \W1, \W2            // second round word: W1 ^ W2
    add     w11, w11, \D
    add     w11, w11, w10
    add     \D, w11, w9             // D = TT1 = FF + D + SS2 + (W1 ^ W2)
    eor     \H, w12, w12, ror #23   // TT2 ^ (TT2 <<< 9)
    ror     w9, w12, #15            // TT2 <<< 17
    eor     \H, \H, w9              // H = P0(TT2)
.endm

// void SM3_CompressAsm(uint32_t state[8], const uint8_t *data, uint32_t blockCnt);
.globl  SM3_CompressAsm
.type   SM3_CompressAsm, %function
.align  4
SM3_CompressAsm:
AARCH64_PACIASP
    /*
     * In:  x0 = state (eight 32-bit words, updated in place)
     *      x1 = data  (blockCnt consecutive 64-byte blocks)
     *      w2 = blockCnt (unsigned; 0 leaves state untouched)
     *
     * Register roles inside the function:
     *   x22      state pointer
     *   x23      data pointer, advanced by 64 per block
     *   w24      number of blocks still to process
     *   x25,x26  two 64-byte stack buffers holding expanded message words
     *   w0-w7    working variables A..H (renamed round by round via macro args)
     *   w13      round constant, rotated inside first16/second48
     *   w15,w20  W[j], W[j+1] for the next two rounds
     *   w19,w21  W[j+4], W[j+5] for the next two rounds
     *   w9-w12,w14  scratch used by the round macros
     *   v0-v3    W0..W15; v4-v21 expanded words, three per vector (lane 3 is padding)
     *   v21-v30  temporaries of the message expansion
     *
     * Frame: 128 bytes for x19-x26 and d8-d15, plus 2 x 64 bytes of word buffers.
     */
    sub     sp, sp, #128
    stp     x19, x20, [sp]
    stp     x21, x22, [sp, #16]
    stp     x23, x24, [sp, #32]
    stp     x25, x26, [sp, #48]
    // d8-d15 are callee-saved under the calling convention and are used below (v8-v15).
    stp     d8, d9, [sp, #64]
    stp     d10, d11, [sp, #80]
    stp     d12, d13, [sp, #96]
    stp     d14, d15, [sp, #112]

    sub     sp, sp, #64
    mov     x25, sp                 // x25: first 64-byte word buffer
    sub     sp, sp, #64
    mov     x26, sp                 // x26: second 64-byte word buffer

    mov     x22, x0                 // x22: state
    mov     x23, x1                 // x23: data
    mov     w24, w2                 // w24: blockCnt

    // w0-w7: word registers A..H of the SM3 cryptographic hash algorithm
    ldp     w0, w1, [x22]
    ldp     w2, w3, [x22, #8]
    ldp     w4, w5, [x22, #16]
    ldp     w6, w7, [x22, #24]

    prfm    pldl1keep, [x23, #64]
.Lsm3_block_loop:
    // blockCnt is unsigned, so test for zero instead of branching on the sign
    // of (count - 1); counts above 0x80000000 must still be processed.
    cbz     w24, .Lsm3_done
    sub     w24, w24, #1
    // Only three message words can be expanded in parallel, because W[j]
    // depends on W[j-3]. ext is used to line up the required word triples.
    // To hide latency, the message expansion is interleaved with the
    // compression rounds: about three rounds are issued per expanded triple.

    // v0-v3: message words W0..W15
    ld1     {v0.4s-v3.4s}, [x23]
#ifndef HITLS_BIG_ENDIAN
    rev32   v0.16b, v0.16b
    rev32   v1.16b, v1.16b
    rev32   v2.16b, v2.16b
    rev32   v3.16b, v3.16b
#endif

    ldp     w15, w20, [x23]         // W0, W1
    ldp     w19, w21, [x23, #16]    // W4, W5
#ifndef HITLS_BIG_ENDIAN
    rev     w15, w15
    rev     w19, w19
    rev     w20, w20
    rev     w21, w21
#endif

    // Group 1: expand W16, W17, W18
    ext     v24.16b, v3.16b, v3.16b, #4     // W13, W14, W15
    ext     v25.16b, v0.16b, v1.16b, #12    // W3, W4, W5
    ext     v23.16b, v1.16b, v2.16b, #12    // W7, W8, W9
    ext     v26.16b, v2.16b, v3.16b, #8     // W10, W11, W12
    eor     v27.16b, v0.16b, v23.16b
    // w13: round constant for rounds 0..15 (0x79cc4519 rotated right by one bit)
    mov     w13, #0x228c
    movk    w13, #0xbce6, lsl #16

    // Rotated terms: W[j-3] <<< 15 and W[j-13] <<< 7
    shl     v21.4s, v24.4s, #15
    shl     v22.4s, v25.4s, #7
    sri     v21.4s, v24.4s, #17             // (W13, W14, W15) <<< 15
    sri     v22.4s, v25.4s, #25             // (W3, W4, W5) <<< 7
    first16 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19     // round 0
    eor     v27.16b, v21.16b, v27.16b
    eor     v28.16b, v22.16b, v26.16b
    first16 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21     // round 1
    // Permutation P1(X) = X ^ (X <<< 15) ^ (X <<< 23)
    shl     v29.4s, v27.4s, #15
    shl     v30.4s, v27.4s, #23
    sri     v29.4s, v27.4s, #17
    sri     v30.4s, v27.4s, #9
    eor     v27.16b, v27.16b, v29.16b
    ldp     w15, w20, [x23, #8]     // W2, W3
    ldp     w19, w21, [x23, #24]    // W6, W7
#ifndef HITLS_BIG_ENDIAN
    rev     w15, w15
    rev     w19, w19
    rev     w20, w20
    rev     w21, w21
#endif
    eor     v27.16b, v27.16b, v30.16b
    first16 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19     // round 2
    eor     v4.16b, v27.16b, v28.16b            // v4 = W16, W17, W18

    // Group 2: expand W19, W20, W21
    ext     v23.16b, v1.16b, v2.16b, #8     // W6, W7, W8
    eor     v27.16b, v25.16b, v26.16b
    first16 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21     // round 3
    shl     v21.4s, v4.4s, #15
    shl     v22.4s, v23.4s, #7
    sri     v21.4s, v4.4s, #17              // (W16, W17, W18) <<< 15
    sri     v22.4s, v23.4s, #25             // (W6, W7, W8) <<< 7
    ldp     w15, w20, [x23, #16]    // W4, W5
    ldp     w19, w21, [x23, #32]    // W8, W9
#ifndef HITLS_BIG_ENDIAN
    rev     w15, w15
    rev     w19, w19
    rev     w20, w20
    rev     w21, w21
#endif
    eor     v27.16b, v21.16b, v27.16b
    eor     v28.16b, v22.16b, v24.16b
    first16 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19     // round 4
    shl     v29.4s, v27.4s, #15
    shl     v30.4s, v27.4s, #23
    sri     v29.4s, v27.4s, #17
    sri     v30.4s, v27.4s, #9
    eor     v27.16b, v27.16b, v29.16b
    first16 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21     // round 5
    eor     v27.16b, v27.16b, v30.16b
    mov     v4.s[3], v4.s[2]                // duplicate W18 into lane 3 for a later ext
    eor     v5.16b, v27.16b, v28.16b            // v5 = W19, W20, W21

    // Group 3: expand W22, W23, W24
    ext     v25.16b, v2.16b, v3.16b, #4     // W9, W10, W11
    eor     v27.16b, v23.16b, v24.16b
    ldp     w15, w20, [x23, #24]    // W6, W7
    ldp     w19, w21, [x23, #40]    // W10, W11
#ifndef HITLS_BIG_ENDIAN
    rev     w15, w15
    rev     w19, w19
    rev     w20, w20
    rev     w21, w21
#endif
    shl     v21.4s, v5.4s, #15
    shl     v22.4s, v25.4s, #7
    sri     v21.4s, v5.4s, #17              // (W19, W20, W21) <<< 15
    sri     v22.4s, v25.4s, #25             // (W9, W10, W11) <<< 7
    first16 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19     // round 6
    eor     v27.16b, v21.16b, v27.16b
    eor     v28.16b, v22.16b, v4.16b
    first16 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21     // round 7
    shl     v29.4s, v27.4s, #15
    shl     v30.4s, v27.4s, #23
    sri     v29.4s, v27.4s, #17
    sri     v30.4s, v27.4s, #9
    eor     v27.16b, v27.16b, v29.16b
    ldp     w15, w20, [x23, #32]    // W8, W9
    ldp     w19, w21, [x23, #48]    // W12, W13
#ifndef HITLS_BIG_ENDIAN
    rev     w15, w15
    rev     w19, w19
    rev     w20, w20
    rev     w21, w21
#endif
    first16 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19     // round 8
    eor     v27.16b, v27.16b, v30.16b
    mov     v5.s[3], v5.s[2]                // duplicate W21 into lane 3 for a later ext
    eor     v6.16b, v27.16b, v28.16b            // v6 = W22, W23, W24

    // Group 4: expand W25, W26, W27
    eor     v27.16b, v25.16b, v4.16b
    shl     v21.4s, v6.4s, #15
    shl     v22.4s, v3.4s, #7
    sri     v21.4s, v6.4s, #17              // (W22, W23, W24) <<< 15
    sri     v22.4s, v3.4s, #25              // (W12, W13, W14) <<< 7
    first16 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21     // round 9
    eor     v27.16b, v21.16b, v27.16b
    eor     v28.16b, v22.16b, v5.16b
    ldp     w15, w20, [x23, #40]    // W10, W11
    ldp     w19, w21, [x23, #56]    // W14, W15
#ifndef HITLS_BIG_ENDIAN
    rev     w15, w15
    rev     w19, w19
    rev     w20, w20
    rev     w21, w21
#endif
    shl     v29.4s, v27.4s, #15
    shl     v30.4s, v27.4s, #23
    sri     v29.4s, v27.4s, #17
    sri     v30.4s, v27.4s, #9
    first16 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19     // round 10
    eor     v27.16b, v27.16b, v29.16b
    first16 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21     // round 11
    eor     v27.16b, v27.16b, v30.16b
    mov     v6.s[3], v6.s[2]                // duplicate W24 into lane 3 for a later ext
    eor     v7.16b, v27.16b, v28.16b            // v7 = W25, W26, W27

    // Group 5: expand W28, W29, W30
    ext     v23.16b, v3.16b, v4.16b, #12    // W15, W16, W17
    eor     v27.16b, v3.16b, v5.16b
    // Spill W16..W27 to [x25]. Every fourth 32-bit slot is padding, so the
    // scalar loads below skip offsets 12, 28, 44 and 60.
    st1     {v4.4s-v7.4s}, [x25]
    shl     v21.4s, v7.4s, #15
    shl     v22.4s, v23.4s, #7
    sri     v21.4s, v7.4s, #17              // (W25, W26, W27) <<< 15
    sri     v22.4s, v23.4s, #25             // (W15, W16, W17) <<< 7
    ldp     w15, w20, [x23, #48]    // W12, W13
    ldp     w19, w21, [x25]         // W16, W17 (already in host order)
#ifndef HITLS_BIG_ENDIAN
    rev     w15, w15
    rev     w20, w20
#endif
    first16 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19     // round 12
    eor     v27.16b, v21.16b, v27.16b
    eor     v28.16b, v22.16b, v6.16b
    shl     v29.4s, v27.4s, #15
    shl     v30.4s, v27.4s, #23
    sri     v29.4s, v27.4s, #17
    sri     v30.4s, v27.4s, #9
    eor     v27.16b, v27.16b, v29.16b
    first16 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21     // round 13
    ldp     w15, w20, [x23, #56]    // W14, W15
#ifndef HITLS_BIG_ENDIAN
    rev     w15, w15
    rev     w20, w20
#endif
    add     x23, x23, #64           // this block is fully loaded; step to the next one
    prfm    pldl1keep, [x23, #64]
    ldr     w19, [x25, #8]          // W18
    ldr     w21, [x25, #16]         // W19
    eor     v27.16b, v27.16b, v30.16b
    first16 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19     // round 14
    mov     v7.s[3], v7.s[2]                // duplicate W27 into lane 3 for a later ext
    eor     v8.16b, v27.16b, v28.16b            // v8 = W28, W29, W30

    // Group 6: expand W31, W32, W33. The first 16 rounds finish here and the
    // remaining 48 rounds start, still interleaved with the expansion.
    ext     v24.16b, v4.16b, v5.16b, #12    // W18, W19, W20
    eor     v27.16b, v23.16b, v6.16b
    first16 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21     // round 15
    shl     v21.4s, v8.4s, #15
    shl     v22.4s, v24.4s, #7
    sri     v21.4s, v8.4s, #17              // (W28, W29, W30) <<< 15
    sri     v22.4s, v24.4s, #25             // (W18, W19, W20) <<< 7
    ldp     w15, w20, [x25]         // W16, W17
    ldp     w19, w21, [x25, #20]    // W20, W21
    eor     v27.16b, v21.16b, v27.16b
    eor     v28.16b, v22.16b, v7.16b
    // w13: round constant for rounds 16..63 ((0x7a879d8a <<< 16) rotated right by one bit)
    mov     w13, #0x3d43
    movk    w13, #0xcec5, lsl #16
    second48 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19    // round 16
    shl     v29.4s, v27.4s, #15
    shl     v30.4s, v27.4s, #23
    sri     v29.4s, v27.4s, #17
    sri     v30.4s, v27.4s, #9
    eor     v27.16b, v27.16b, v29.16b
    second48 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21    // round 17
    eor     v27.16b, v27.16b, v30.16b
    mov     v8.s[3], v8.s[2]                // duplicate W30 into lane 3 for a later ext
    eor     v9.16b, v27.16b, v28.16b            // v9 = W31, W32, W33

    // Group 7: expand W34, W35, W36
    ext     v23.16b, v5.16b, v6.16b, #12    // W21, W22, W23
    eor     v27.16b, v24.16b, v7.16b
    ldr     w15, [x25, #8]          // W18
    ldr     w20, [x25, #16]         // W19
    ldp     w19, w21, [x25, #32]    // W22, W23
    shl     v21.4s, v9.4s, #15
    shl     v22.4s, v23.4s, #7
    sri     v21.4s, v9.4s, #17              // (W31, W32, W33) <<< 15
    sri     v22.4s, v23.4s, #25             // (W21, W22, W23) <<< 7
    second48 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19    // round 18
    eor     v27.16b, v21.16b, v27.16b
    eor     v28.16b, v22.16b, v8.16b
    second48 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21    // round 19
    shl     v29.4s, v27.4s, #15
    shl     v30.4s, v27.4s, #23
    sri     v29.4s, v27.4s, #17
    sri     v30.4s, v27.4s, #9
    eor     v27.16b, v27.16b, v29.16b
    ldp     w15, w20, [x25, #20]    // W20, W21
    ldr     w19, [x25, #40]         // W24
    ldr     w21, [x25, #48]         // W25
    eor     v27.16b, v27.16b, v30.16b
    second48 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19    // round 20
    mov     v9.s[3], v9.s[2]                // duplicate W33 into lane 3 for a later ext
    eor     v10.16b, v27.16b, v28.16b           // v10 = W34, W35, W36

    // Group 8: expand W37, W38, W39
    ext     v24.16b, v6.16b, v7.16b, #12    // W24, W25, W26
    eor     v27.16b, v23.16b, v8.16b
    second48 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21    // round 21
    shl     v21.4s, v10.4s, #15
    shl     v22.4s, v24.4s, #7
    sri     v21.4s, v10.4s, #17             // (W34, W35, W36) <<< 15
    sri     v22.4s, v24.4s, #25             // (W24, W25, W26) <<< 7
    ldp     w15, w20, [x25, #32]    // W22, W23
    ldp     w19, w21, [x25, #52]    // W26, W27
    eor     v27.16b, v21.16b, v27.16b
    eor     v28.16b, v22.16b, v9.16b
    second48 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19    // round 22
    shl     v29.4s, v27.4s, #15
    shl     v30.4s, v27.4s, #23
    sri     v29.4s, v27.4s, #17
    sri     v30.4s, v27.4s, #9
    eor     v27.16b, v27.16b, v29.16b
    second48 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21    // round 23
    eor     v27.16b, v27.16b, v30.16b
    mov     v10.s[3], v10.s[2]              // duplicate W36 into lane 3 for a later ext
    eor     v11.16b, v27.16b, v28.16b           // v11 = W37, W38, W39

    // Group 9: expand W40, W41, W42
    ext     v23.16b, v7.16b, v8.16b, #12    // W27, W28, W29
    eor     v27.16b, v24.16b, v9.16b
    st1     {v8.4s-v11.4s}, [x26]           // spill W28..W39 (same padded layout)
    shl     v21.4s, v11.4s, #15
    shl     v22.4s, v23.4s, #7
    sri     v21.4s, v11.4s, #17             // (W37, W38, W39) <<< 15
    sri     v22.4s, v23.4s, #25             // (W27, W28, W29) <<< 7
    ldr     w15, [x25, #40]         // W24
    ldr     w20, [x25, #48]         // W25
    ldp     w19, w21, [x26]         // W28, W29
    second48 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19    // round 24
    eor     v27.16b, v21.16b, v27.16b
    eor     v28.16b, v22.16b, v10.16b
    second48 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21    // round 25
    shl     v29.4s, v27.4s, #15
    shl     v30.4s, v27.4s, #23
    sri     v29.4s, v27.4s, #17
    sri     v30.4s, v27.4s, #9
    eor     v27.16b, v27.16b, v29.16b
    ldp     w15, w20, [x25, #52]    // W26, W27
    ldr     w19, [x26, #8]          // W30
    ldr     w21, [x26, #16]         // W31
    eor     v27.16b, v27.16b, v30.16b
    second48 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19    // round 26
    mov     v11.s[3], v11.s[2]              // duplicate W39 into lane 3 for a later ext
    eor     v12.16b, v27.16b, v28.16b           // v12 = W40, W41, W42

    // Group 10: expand W43, W44, W45
    ext     v24.16b, v8.16b, v9.16b, #12    // W30, W31, W32
    eor     v27.16b, v23.16b, v10.16b
    second48 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21    // round 27
    shl     v21.4s, v12.4s, #15
    shl     v22.4s, v24.4s, #7
    sri     v21.4s, v12.4s, #17             // (W40, W41, W42) <<< 15
    sri     v22.4s, v24.4s, #25             // (W30, W31, W32) <<< 7
    ldp     w15, w20, [x26]         // W28, W29
    ldp     w19, w21, [x26, #20]    // W32, W33
    eor     v27.16b, v21.16b, v27.16b
    eor     v28.16b, v22.16b, v11.16b
    second48 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19    // round 28
    shl     v29.4s, v27.4s, #15
    shl     v30.4s, v27.4s, #23
    sri     v29.4s, v27.4s, #17
    sri     v30.4s, v27.4s, #9
    eor     v27.16b, v27.16b, v29.16b
    second48 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21    // round 29
    eor     v27.16b, v27.16b, v30.16b
    mov     v12.s[3], v12.s[2]              // duplicate W42 into lane 3 for a later ext
    eor     v13.16b, v27.16b, v28.16b           // v13 = W43, W44, W45

    // Group 11: expand W46, W47, W48
    ext     v23.16b, v9.16b, v10.16b, #12   // W33, W34, W35
    eor     v27.16b, v24.16b, v11.16b
    ldr     w15, [x26, #8]          // W30
    ldr     w20, [x26, #16]         // W31
    ldp     w19, w21, [x26, #32]    // W34, W35
    shl     v21.4s, v13.4s, #15
    shl     v22.4s, v23.4s, #7
    sri     v21.4s, v13.4s, #17             // (W43, W44, W45) <<< 15
    sri     v22.4s, v23.4s, #25             // (W33, W34, W35) <<< 7
    second48 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19    // round 30
    eor     v27.16b, v21.16b, v27.16b
    eor     v28.16b, v22.16b, v12.16b
    second48 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21    // round 31
    shl     v29.4s, v27.4s, #15
    shl     v30.4s, v27.4s, #23
    sri     v29.4s, v27.4s, #17
    sri     v30.4s, v27.4s, #9
    eor     v27.16b, v27.16b, v29.16b
    ldp     w15, w20, [x26, #20]    // W32, W33
    ldr     w19, [x26, #40]         // W36
    ldr     w21, [x26, #48]         // W37
    second48 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19    // round 32
    eor     v27.16b, v27.16b, v30.16b
    mov     v13.s[3], v13.s[2]              // duplicate W45 into lane 3 for a later ext
    eor     v14.16b, v27.16b, v28.16b           // v14 = W46, W47, W48

    // Group 12: expand W49, W50, W51
    ext     v24.16b, v10.16b, v11.16b, #12  // W36, W37, W38
    eor     v27.16b, v23.16b, v12.16b
    second48 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21    // round 33
    shl     v21.4s, v14.4s, #15
    shl     v22.4s, v24.4s, #7
    sri     v21.4s, v14.4s, #17             // (W46, W47, W48) <<< 15
    sri     v22.4s, v24.4s, #25             // (W36, W37, W38) <<< 7
    ldp     w15, w20, [x26, #32]    // W34, W35
    ldp     w19, w21, [x26, #52]    // W38, W39
    eor     v27.16b, v21.16b, v27.16b
    eor     v28.16b, v22.16b, v13.16b
    second48 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19    // round 34
    shl     v29.4s, v27.4s, #15
    shl     v30.4s, v27.4s, #23
    sri     v29.4s, v27.4s, #17
    sri     v30.4s, v27.4s, #9
    eor     v27.16b, v27.16b, v29.16b
    second48 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21    // round 35
    eor     v27.16b, v27.16b, v30.16b
    mov     v14.s[3], v14.s[2]              // duplicate W48 into lane 3 for a later ext
    eor     v15.16b, v27.16b, v28.16b           // v15 = W49, W50, W51

    // Group 13: expand W52, W53, W54
    ext     v23.16b, v11.16b, v12.16b, #12  // W39, W40, W41
    eor     v27.16b, v24.16b, v13.16b
    st1     {v12.4s-v15.4s}, [x25]          // spill W40..W51; W16..W27 are no longer read
    shl     v21.4s, v15.4s, #15
    shl     v22.4s, v23.4s, #7
    sri     v21.4s, v15.4s, #17             // (W49, W50, W51) <<< 15
    sri     v22.4s, v23.4s, #25             // (W39, W40, W41) <<< 7
    ldr     w15, [x26, #40]         // W36
    ldr     w20, [x26, #48]         // W37
    ldp     w19, w21, [x25]         // W40, W41
    second48 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19    // round 36
    eor     v27.16b, v21.16b, v27.16b
    eor     v28.16b, v22.16b, v14.16b
    second48 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21    // round 37
    shl     v29.4s, v27.4s, #15
    shl     v30.4s, v27.4s, #23
    sri     v29.4s, v27.4s, #17
    sri     v30.4s, v27.4s, #9
    eor     v27.16b, v27.16b, v29.16b
    ldp     w15, w20, [x26, #52]    // W38, W39
    ldr     w19, [x25, #8]          // W42
    ldr     w21, [x25, #16]         // W43
    second48 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19    // round 38
    eor     v27.16b, v27.16b, v30.16b
    mov     v15.s[3], v15.s[2]              // duplicate W51 into lane 3 for a later ext
    eor     v16.16b, v27.16b, v28.16b           // v16 = W52, W53, W54

    // Group 14: expand W55, W56, W57
    ext     v24.16b, v12.16b, v13.16b, #12  // W42, W43, W44
    eor     v27.16b, v23.16b, v14.16b
    shl     v21.4s, v16.4s, #15
    shl     v22.4s, v24.4s, #7
    sri     v21.4s, v16.4s, #17             // (W52, W53, W54) <<< 15
    sri     v22.4s, v24.4s, #25             // (W42, W43, W44) <<< 7
    second48 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21    // round 39
    eor     v27.16b, v21.16b, v27.16b
    eor     v28.16b, v22.16b, v15.16b
    ldp     w15, w20, [x25]         // W40, W41
    ldp     w19, w21, [x25, #20]    // W44, W45
    shl     v29.4s, v27.4s, #15
    shl     v30.4s, v27.4s, #23
    sri     v29.4s, v27.4s, #17
    sri     v30.4s, v27.4s, #9
    eor     v27.16b, v27.16b, v29.16b
    second48 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19    // round 40
    eor     v27.16b, v27.16b, v30.16b
    second48 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21    // round 41
    mov     v16.s[3], v16.s[2]              // duplicate W54 into lane 3 for a later ext
    eor     v17.16b, v27.16b, v28.16b           // v17 = W55, W56, W57

    // Group 15: expand W58, W59, W60
    ext     v23.16b, v13.16b, v14.16b, #12  // W45, W46, W47
    eor     v27.16b, v24.16b, v15.16b
    shl     v21.4s, v17.4s, #15
    shl     v22.4s, v23.4s, #7
    sri     v21.4s, v17.4s, #17             // (W55, W56, W57) <<< 15
    sri     v22.4s, v23.4s, #25             // (W45, W46, W47) <<< 7
    ldr     w15, [x25, #8]          // W42
    ldr     w20, [x25, #16]         // W43
    ldp     w19, w21, [x25, #32]    // W46, W47
    second48 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19    // round 42
    eor     v27.16b, v21.16b, v27.16b
    eor     v28.16b, v22.16b, v16.16b
    second48 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21    // round 43
    shl     v29.4s, v27.4s, #15
    shl     v30.4s, v27.4s, #23
    sri     v29.4s, v27.4s, #17
    sri     v30.4s, v27.4s, #9
    eor     v27.16b, v27.16b, v29.16b
    ldp     w15, w20, [x25, #20]    // W44, W45
    ldr     w19, [x25, #40]         // W48
    ldr     w21, [x25, #48]         // W49
    second48 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19    // round 44
    eor     v27.16b, v27.16b, v30.16b
    eor     v18.16b, v27.16b, v28.16b           // v18 = W58, W59, W60

    // Group 16: expand W61, W62, W63
    ext     v24.16b, v14.16b, v15.16b, #12  // W48, W49, W50
    eor     v27.16b, v23.16b, v16.16b
    shl     v21.4s, v18.4s, #15
    shl     v22.4s, v24.4s, #7
    sri     v21.4s, v18.4s, #17             // (W58, W59, W60) <<< 15
    sri     v22.4s, v24.4s, #25             // (W48, W49, W50) <<< 7
    second48 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21    // round 45
    eor     v27.16b, v21.16b, v27.16b
    eor     v28.16b, v22.16b, v17.16b
    ldp     w15, w20, [x25, #32]    // W46, W47
    ldp     w19, w21, [x25, #52]    // W50, W51
    shl     v29.4s, v27.4s, #15
    shl     v30.4s, v27.4s, #23
    sri     v29.4s, v27.4s, #17
    sri     v30.4s, v27.4s, #9
    eor     v27.16b, v27.16b, v29.16b
    second48 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19    // round 46
    eor     v27.16b, v27.16b, v30.16b
    second48 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21    // round 47
    eor     v19.16b, v27.16b, v28.16b           // v19 = W61, W62, W63

    // Group 17: expand W64, W65, W66
    ext     v23.16b, v15.16b, v16.16b, #12  // W51, W52, W53
    eor     v27.16b, v24.16b, v17.16b
    st1     {v16.4s-v19.4s}, [x26]          // spill W52..W63; W28..W39 are no longer read
    shl     v21.4s, v19.4s, #15
    shl     v22.4s, v23.4s, #7
    sri     v21.4s, v19.4s, #17             // (W61, W62, W63) <<< 15
    sri     v22.4s, v23.4s, #25             // (W51, W52, W53) <<< 7
    ldr     w15, [x25, #40]         // W48
    ldr     w20, [x25, #48]         // W49
    ldp     w19, w21, [x26]         // W52, W53
    second48 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19    // round 48
    eor     v27.16b, v21.16b, v27.16b
    eor     v28.16b, v22.16b, v18.16b
    second48 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21    // round 49
    shl     v29.4s, v27.4s, #15
    shl     v30.4s, v27.4s, #23
    sri     v29.4s, v27.4s, #17
    sri     v30.4s, v27.4s, #9
    eor     v27.16b, v27.16b, v29.16b
    ldp     w15, w20, [x25, #52]    // W50, W51
    ldr     w19, [x26, #8]          // W54
    ldr     w21, [x26, #16]         // W55
    eor     v27.16b, v27.16b, v30.16b
    second48 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19    // round 50
    eor     v20.16b, v27.16b, v28.16b           // v20 = W64, W65, W66

    // Group 18: expand W67 (only lane 0 of the result is consumed)
    ext     v24.16b, v16.16b, v17.16b, #12  // W54, W55, W56
    eor     v27.16b, v23.16b, v18.16b
    shl     v21.4s, v20.4s, #15
    shl     v22.4s, v24.4s, #7
    sri     v21.4s, v20.4s, #17             // (W64, W65, W66) <<< 15
    sri     v22.4s, v24.4s, #25             // (W54, W55, W56) <<< 7
    second48 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21    // round 51
    eor     v27.16b, v21.16b, v27.16b
    eor     v28.16b, v22.16b, v19.16b
    ldp     w15, w20, [x26]         // W52, W53
    ldp     w19, w21, [x26, #20]    // W56, W57
    shl     v29.4s, v27.4s, #15
    shl     v30.4s, v27.4s, #23
    sri     v29.4s, v27.4s, #17
    sri     v30.4s, v27.4s, #9
    eor     v27.16b, v27.16b, v29.16b
    second48 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19    // round 52
    eor     v27.16b, v27.16b, v30.16b
    second48 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21    // round 53
    eor     v21.16b, v27.16b, v28.16b           // v21 lane 0 = W67

    // Message expansion is complete; run the remaining rounds.
    ldr     w15, [x26, #8]          // W54
    ldr     w20, [x26, #16]         // W55
    ldp     w19, w21, [x26, #32]    // W58, W59
    second48 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19    // round 54
    st1     {v20.4s-v21.4s}, [x25]          // spill W64..W67; W40..W51 are no longer read
    second48 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21    // round 55
    ldp     w15, w20, [x26, #20]    // W56, W57
    ldr     w19, [x26, #40]         // W60
    ldr     w21, [x26, #48]         // W61
    second48 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19    // round 56

    second48 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21    // round 57
    ldp     w15, w20, [x26, #32]    // W58, W59
    ldp     w19, w21, [x26, #52]    // W62, W63
    second48 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19    // round 58
    second48 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21    // round 59
    ldr     w15, [x26, #40]         // W60
    ldr     w20, [x26, #48]         // W61
    ldp     w19, w21, [x25]         // W64, W65
    second48 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19    // round 60
    second48 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21    // round 61
    ldp     w15, w20, [x26, #52]    // W62, W63
    ldr     w19, [x25, #8]          // W66
    ldr     w21, [x25, #16]         // W67
    second48 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19    // round 62
    second48 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21    // round 63

    // XOR the round output with the previous hash value ...
    ldp     w9, w10, [x22]
    ldp     w11, w12, [x22, #8]
    ldp     w13, w14, [x22, #16]
    ldp     w15, w19, [x22, #24]
    eor     w0, w0, w9
    eor     w1, w1, w10
    eor     w2, w2, w11
    eor     w3, w3, w12
    eor     w4, w4, w13
    eor     w5, w5, w14
    eor     w6, w6, w15
    eor     w7, w7, w19
    // ... and store it; w0-w7 stay live as A..H for the next block.
    stp     w0, w1, [x22]
    stp     w2, w3, [x22, #8]
    stp     w4, w5, [x22, #16]
    stp     w6, w7, [x22, #24]
    b       .Lsm3_block_loop
.Lsm3_done:

    add     sp, sp, #128            // release the two 64-byte word buffers

    ldp     x19, x20, [sp]
    ldp     x21, x22, [sp, #16]
    ldp     x23, x24, [sp, #32]
    ldp     x25, x26, [sp, #48]
    ldp     d8, d9, [sp, #64]
    ldp     d10, d11, [sp, #80]
    ldp     d12, d13, [sp, #96]
    ldp     d14, d15, [sp, #112]
    add     sp, sp, #128

AARCH64_AUTIASP
    ret
.size SM3_CompressAsm,.-SM3_CompressAsm

#endif