/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions
 *
 * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.arch		armv8-a+crypto

	dga		.req	q20
	dgav		.req	v20
	dgb		.req	q21
	dgbv		.req	v21

	t0		.req	v22
	t1		.req	v23

	dg0q		.req	q24
	dg0v		.req	v24
	dg1q		.req	q25
	dg1v		.req	v25
	dg2q		.req	q26
	dg2v		.req	v26

	.macro		add_only, ev, rc, s0
	mov		dg2v.16b, dg0v.16b
	.ifeq		\ev
	add		t1.4s, v\s0\().4s, \rc\().4s
	sha256h		dg0q, dg1q, t0.4s
	sha256h2	dg1q, dg2q, t0.4s
	.else
	.ifnb		\s0
	add		t0.4s, v\s0\().4s, \rc\().4s
	.endif
	sha256h		dg0q, dg1q, t1.4s
	sha256h2	dg1q, dg2q, t1.4s
	.endif
	.endm

	.macro		add_update, ev, rc, s0, s1, s2, s3
	sha256su0	v\s0\().4s, v\s1\().4s
	add_only	\ev, \rc, \s1
	sha256su1	v\s0\().4s, v\s2\().4s, v\s3\().4s
	.endm

	/*
	 * The SHA-256 round constants
	 */
	.section	".rodata", "a"
	.align		4
.Lsha2_rcon:
	.word		0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.word		0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.word		0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.word		0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.word		0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.word		0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.word		0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.word		0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.word		0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.word		0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.word		0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.word		0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.word		0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.word		0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.word		0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.word		0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2

	.macro		load_round_constants	tmp
	adr_l		\tmp, .Lsha2_rcon
	ld1		{ v0.4s- v3.4s}, [\tmp], #64
	ld1		{ v4.4s- v7.4s}, [\tmp], #64
	ld1		{ v8.4s-v11.4s}, [\tmp], #64
	ld1		{v12.4s-v15.4s}, [\tmp]
	.endm

	/*
	 * int __sha256_ce_transform(struct sha256_ce_state *sst, u8 const *src,
	 *			     int blocks)
	 */
	.text
SYM_FUNC_START(__sha256_ce_transform)

	load_round_constants	x8

	/* load state */
	ld1		{dgav.4s, dgbv.4s}, [x0]

	/* load sha256_ce_state::finalize */
	ldr_l		w4, sha256_ce_offsetof_finalize, x4
	ldr		w4, [x0, x4]

	/* load input */
0:	ld1		{v16.4s-v19.4s}, [x1], #64
	sub		w2, w2, #1

CPU_LE(	rev32		v16.16b, v16.16b	)
CPU_LE(	rev32		v17.16b, v17.16b	)
CPU_LE(	rev32		v18.16b, v18.16b	)
CPU_LE(	rev32		v19.16b, v19.16b	)

1:	add		t0.4s, v16.4s, v0.4s
	mov		dg0v.16b, dgav.16b
	mov		dg1v.16b, dgbv.16b

	add_update	0,  v1, 16, 17, 18, 19
	add_update	1,  v2, 17, 18, 19, 16
	add_update	0,  v3, 18, 19, 16, 17
	add_update	1,  v4, 19, 16, 17, 18

	add_update	0,  v5, 16, 17, 18, 19
	add_update	1,  v6, 17, 18, 19, 16
	add_update	0,  v7, 18, 19, 16, 17
	add_update	1,  v8, 19, 16, 17, 18

	add_update	0,  v9, 16, 17, 18, 19
	add_update	1, v10, 17, 18, 19, 16
	add_update	0, v11, 18, 19, 16, 17
	add_update	1, v12, 19, 16, 17, 18

	add_only	0, v13, 17
	add_only	1, v14, 18
	add_only	0, v15, 19
	add_only	1

	/* update state */
	add		dgav.4s, dgav.4s, dg0v.4s
	add		dgbv.4s, dgbv.4s, dg1v.4s

	/* handled all input blocks? */
	cbz		w2, 2f
	cond_yield	3f, x5, x6
	b		0b

	/*
	 * Final block: add padding and total bit count.
	 * Skip if the input size was not a round multiple of the block size;
	 * in that case the padding is handled by the C code.
	 */
2:	cbz		x4, 3f
	ldr_l		w4, sha256_ce_offsetof_count, x4
	ldr		x4, [x0, x4]
	movi		v17.2d, #0
	mov		x8, #0x80000000
	movi		v18.2d, #0
	ror		x7, x4, #29		// ror(lsl(x4, 3), 32)
	fmov		d16, x8
	mov		x4, #0
	mov		v19.d[0], xzr
	mov		v19.d[1], x7
	b		1b

	/* store new state */
3:	st1		{dgav.4s, dgbv.4s}, [x0]
	mov		w0, w2
	ret
SYM_FUNC_END(__sha256_ce_transform)

	.unreq		dga
	.unreq		dgav
	.unreq		dgb
	.unreq		dgbv
	.unreq		t0
	.unreq		t1
	.unreq		dg0q
	.unreq		dg0v
	.unreq		dg1q
	.unreq		dg1v
	.unreq		dg2q
	.unreq		dg2v

	// parameters for __sha256_ce_finup2x()
	sctx		.req	x0
	data1		.req	x1
	data2		.req	x2
	len		.req	w3
	out1		.req	x4
	out2		.req	x5

	// other scalar variables
	count		.req	x6
	final_step	.req	w7

	// x8-x9 are used as temporaries.

	// v0-v15 are used to cache the SHA-256 round constants.
	// v16-v19 are used for the message schedule for the first message.
	// v20-v23 are used for the message schedule for the second message.
	// v24-v31 are used for the state and temporaries as given below.
	// *_a are for the first message and *_b for the second.
	state0_a_q	.req	q24
	state0_a	.req	v24
	state1_a_q	.req	q25
	state1_a	.req	v25
	state0_b_q	.req	q26
	state0_b	.req	v26
	state1_b_q	.req	q27
	state1_b	.req	v27
	t0_a		.req	v28
	t0_b		.req	v29
	t1_a_q		.req	q30
	t1_a		.req	v30
	t1_b_q		.req	q31
	t1_b		.req	v31

#define OFFSETOF_COUNT	32		// offsetof(struct sha256_state, count)
#define OFFSETOF_BUF	40		// offsetof(struct sha256_state, buf)
// offsetof(struct sha256_state, state) is assumed to be 0.

	// Do 4 rounds of SHA-256 for each of two messages (interleaved).  m0_a
	// and m0_b contain the current 4 message schedule words for the first
	// and second message respectively.
	//
	// If not all the message schedule words have been computed yet, then
	// this also computes 4 more message schedule words for each message.
	// m1_a-m3_a contain the next 3 groups of 4 message schedule words for
	// the first message, and likewise m1_b-m3_b for the second.  After
	// consuming the current value of m0_a, this macro computes the group
	// after m3_a and writes it to m0_a, and likewise for *_b.  This means
	// that the next (m0_a, m1_a, m2_a, m3_a) is the current (m1_a, m2_a,
	// m3_a, m0_a), and likewise for *_b, so the caller must cycle through
	// the registers accordingly.
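	//
	// For example, do_16rounds_2x below invokes this macro four times and
	// cycles the schedule registers of the first message as
	//	(v16, v17, v18, v19) -> (v17, v18, v19, v16)
	//	-> (v18, v19, v16, v17) -> (v19, v16, v17, v18),
	// and likewise v20-v23 for the second message.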
	.macro	do_4rounds_2x	i, k,  m0_a, m1_a, m2_a, m3_a,  \
				       m0_b, m1_b, m2_b, m3_b
	add		t0_a\().4s, \m0_a\().4s, \k\().4s
	add		t0_b\().4s, \m0_b\().4s, \k\().4s
	.if \i < 48
	sha256su0	\m0_a\().4s, \m1_a\().4s
	sha256su0	\m0_b\().4s, \m1_b\().4s
	sha256su1	\m0_a\().4s, \m2_a\().4s, \m3_a\().4s
	sha256su1	\m0_b\().4s, \m2_b\().4s, \m3_b\().4s
	.endif
	mov		t1_a.16b, state0_a.16b
	mov		t1_b.16b, state0_b.16b
	sha256h		state0_a_q, state1_a_q, t0_a\().4s
	sha256h		state0_b_q, state1_b_q, t0_b\().4s
	sha256h2	state1_a_q, t1_a_q, t0_a\().4s
	sha256h2	state1_b_q, t1_b_q, t0_b\().4s
	.endm

	.macro	do_16rounds_2x	i, k0, k1, k2, k3
	do_4rounds_2x	\i + 0,  \k0,  v16, v17, v18, v19,  v20, v21, v22, v23
	do_4rounds_2x	\i + 4,  \k1,  v17, v18, v19, v16,  v21, v22, v23, v20
	do_4rounds_2x	\i + 8,  \k2,  v18, v19, v16, v17,  v22, v23, v20, v21
	do_4rounds_2x	\i + 12, \k3,  v19, v16, v17, v18,  v23, v20, v21, v22
	.endm

//
// void __sha256_ce_finup2x(const struct sha256_state *sctx,
//			    const u8 *data1, const u8 *data2, int len,
//			    u8 out1[SHA256_DIGEST_SIZE],
//			    u8 out2[SHA256_DIGEST_SIZE]);
//
// This function computes the SHA-256 digests of two messages |data1| and
// |data2| that are both |len| bytes long, starting from the initial state
// |sctx|.  |len| must be at least SHA256_BLOCK_SIZE.
//
// The instructions for the two SHA-256 operations are interleaved.  On many
// CPUs, this is almost twice as fast as hashing each message individually due
// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
//
SYM_FUNC_START(__sha256_ce_finup2x)
	sub		sp, sp, #128
	mov		final_step, #0
	load_round_constants	x8

	// Load the initial state from sctx->state.
	ld1		{state0_a.4s-state1_a.4s}, [sctx]

	// Load sctx->count.  Take the mod 64 of it to get the number of bytes
	// that are buffered in sctx->buf.  Also save it in a register with len
	// added to it.
	ldr		x8, [sctx, #OFFSETOF_COUNT]
	add		count, x8, len, sxtw
	and		x8, x8, #63
	cbz		x8, .Lfinup2x_enter_loop	// No bytes buffered?

	// x8 bytes (1 to 63) are currently buffered in sctx->buf.  Load them
	// followed by the first 64 - x8 bytes of data.  Since len >= 64, we
	// just load 64 bytes from each of sctx->buf, data1, and data2
	// unconditionally and rearrange the data as needed.
	add		x9, sctx, #OFFSETOF_BUF
	ld1		{v16.16b-v19.16b}, [x9]
	st1		{v16.16b-v19.16b}, [sp]

	ld1		{v16.16b-v19.16b}, [data1], #64
	add		x9, sp, x8
	st1		{v16.16b-v19.16b}, [x9]
	ld1		{v16.4s-v19.4s}, [sp]

	ld1		{v20.16b-v23.16b}, [data2], #64
	st1		{v20.16b-v23.16b}, [x9]
	ld1		{v20.4s-v23.4s}, [sp]

	sub		len, len, #64
	sub		data1, data1, x8
	sub		data2, data2, x8
	add		len, len, w8
	mov		state0_b.16b, state0_a.16b
	mov		state1_b.16b, state1_a.16b
	b		.Lfinup2x_loop_have_data

.Lfinup2x_enter_loop:
	sub		len, len, #64
	mov		state0_b.16b, state0_a.16b
	mov		state1_b.16b, state1_a.16b
.Lfinup2x_loop:
	// Load the next two data blocks.
	ld1		{v16.4s-v19.4s}, [data1], #64
	ld1		{v20.4s-v23.4s}, [data2], #64
.Lfinup2x_loop_have_data:
	// Convert the words of the data blocks from big endian.
CPU_LE(	rev32		v16.16b, v16.16b	)
CPU_LE(	rev32		v17.16b, v17.16b	)
CPU_LE(	rev32		v18.16b, v18.16b	)
CPU_LE(	rev32		v19.16b, v19.16b	)
CPU_LE(	rev32		v20.16b, v20.16b	)
CPU_LE(	rev32		v21.16b, v21.16b	)
CPU_LE(	rev32		v22.16b, v22.16b	)
CPU_LE(	rev32		v23.16b, v23.16b	)
.Lfinup2x_loop_have_bswapped_data:

	// Save the original state for each block.
	st1		{state0_a.4s-state1_b.4s}, [sp]

	// Do the SHA-256 rounds on each block.
	do_16rounds_2x	0,  v0, v1, v2, v3
	do_16rounds_2x	16, v4, v5, v6, v7
	do_16rounds_2x	32, v8, v9, v10, v11
	do_16rounds_2x	48, v12, v13, v14, v15

	// Add the original state for each block.
	ld1		{v16.4s-v19.4s}, [sp]
	add		state0_a.4s, state0_a.4s, v16.4s
	add		state1_a.4s, state1_a.4s, v17.4s
	add		state0_b.4s, state0_b.4s, v18.4s
	add		state1_b.4s, state1_b.4s, v19.4s

	// Update len and loop back if more blocks remain.
	sub		len, len, #64
	tbz		len, #31, .Lfinup2x_loop	// len >= 0?

	// Check if any final blocks need to be handled.
	// final_step = 2: all done
	// final_step = 1: need to do count-only padding block
	// final_step = 0: need to do the block with 0x80 padding byte
	tbnz		final_step, #1, .Lfinup2x_done
	tbnz		final_step, #0, .Lfinup2x_finalize_countonly
	add		len, len, #64
	cbz		len, .Lfinup2x_finalize_blockaligned

	// Not block-aligned; 1 <= len <= 63 data bytes remain.  Pad the block.
	// To do this, write the padding starting with the 0x80 byte to
	// &sp[64].  Then for each message, copy the last 64 data bytes to sp
	// and load from &sp[64 - len] to get the needed padding block.  This
	// code relies on the data buffers being >= 64 bytes in length.
	sub		w8, len, #64		// w8 = len - 64
	add		data1, data1, w8, sxtw	// data1 += len - 64
	add		data2, data2, w8, sxtw	// data2 += len - 64
	mov		x9, #0x80
	fmov		d16, x9
	movi		v17.16b, #0
	stp		q16, q17, [sp, #64]
	stp		q17, q17, [sp, #96]
	sub		x9, sp, w8, sxtw	// x9 = &sp[64 - len]
	cmp		len, #56
	b.ge		1f		// will count spill into its own block?
	lsl		count, count, #3
	rev		count, count
	str		count, [x9, #56]
	mov		final_step, #2	// won't need count-only block
	b		2f
1:
	mov		final_step, #1	// will need count-only block
2:
	ld1		{v16.16b-v19.16b}, [data1]
	st1		{v16.16b-v19.16b}, [sp]
	ld1		{v16.4s-v19.4s}, [x9]
	ld1		{v20.16b-v23.16b}, [data2]
	st1		{v20.16b-v23.16b}, [sp]
	ld1		{v20.4s-v23.4s}, [x9]
	b		.Lfinup2x_loop_have_data

	// Prepare a padding block, either:
	//
	//	{0x80, 0, 0, 0, ..., count (as __be64)}
	//	This is for a block-aligned message.
	//
	//	{   0, 0, 0, 0, ..., count (as __be64)}
	//	This is for a message whose length mod 64 is >= 56.
	//
	// Pre-swap the endianness of the words.
.Lfinup2x_finalize_countonly:
	movi		v16.2d, #0
	b		1f
.Lfinup2x_finalize_blockaligned:
	mov		x8, #0x80000000
	fmov		d16, x8
1:
	movi		v17.2d, #0
	movi		v18.2d, #0
	ror		count, count, #29	// ror(lsl(count, 3), 32)
	mov		v19.d[0], xzr
	mov		v19.d[1], count
	mov		v20.16b, v16.16b
	movi		v21.2d, #0
	movi		v22.2d, #0
	mov		v23.16b, v19.16b
	mov		final_step, #2
	b		.Lfinup2x_loop_have_bswapped_data

.Lfinup2x_done:
	// Write the two digests with all bytes in the correct order.
CPU_LE(	rev32		state0_a.16b, state0_a.16b	)
CPU_LE(	rev32		state1_a.16b, state1_a.16b	)
CPU_LE(	rev32		state0_b.16b, state0_b.16b	)
CPU_LE(	rev32		state1_b.16b, state1_b.16b	)
	st1		{state0_a.4s-state1_a.4s}, [out1]
	st1		{state0_b.4s-state1_b.4s}, [out2]
	add		sp, sp, #128
	ret
SYM_FUNC_END(__sha256_ce_finup2x)
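
/*
 * For reference, a minimal C sketch of what __sha256_ce_finup2x() computes,
 * assuming a hypothetical helper sha256_update_and_final() that absorbs the
 * remaining data into a struct sha256_state and writes out the digest.  The
 * assembly above produces the same two digests, but interleaves the two
 * hashes so that, on many CPUs, the combined throughput is nearly doubled:
 *
 *	void sha256_finup2x_ref(const struct sha256_state *sctx,
 *				const u8 *data1, const u8 *data2, int len,
 *				u8 out1[SHA256_DIGEST_SIZE],
 *				u8 out2[SHA256_DIGEST_SIZE])
 *	{
 *		// Each message starts from the same caller-provided state.
 *		struct sha256_state s1 = *sctx;
 *		struct sha256_state s2 = *sctx;
 *
 *		sha256_update_and_final(&s1, data1, len, out1);
 *		sha256_update_and_final(&s2, data2, len, out2);
 *	}
 */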