#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for ARMv4.
#
# June 2017.
#
# Non-NEON code is KECCAK_1X variant (see sha/keccak1600.c) with bit
# interleaving. How does it compare to Keccak Code Package? It's as
# fast, but several times smaller, and is endian- and ISA-neutral. ISA
# neutrality means that minimum ISA requirement is ARMv4, yet it can
# be assembled even as Thumb-2. NEON code path is KECCAK_1X_ALT with
# register layout taken from Keccak Code Package. It's also as fast,
# in fact faster by 10-15% on some processors, and endian-neutral.
#
# August 2017.
#
# Switch to KECCAK_2X variant for non-NEON code and merge almost 1/2
# of rotate instructions with logical ones. This resulted in ~10%
# improvement on most processors. Switch to KECCAK_2X effectively
# minimizes re-loads from temporary storage, and merged rotates just
# eliminate corresponding instructions. As for the latter: when examining
# code you'll notice commented ror instructions. These are the eliminated
# ones, and you should trace the destination register below to see what's
# going on. Just in case, here is why not all rotates are eliminated. The
# trouble is that you have operations that require both inputs to be
# rotated, e.g. 'eor a,b>>>x,c>>>y'. This conundrum is resolved by using
# 'eor a,b,c>>>(x-y)' and then merge-rotating 'a' in the next operation
# that takes 'a' as input. And the thing is that this next operation can
# be in the next round. It's totally possible to "carry" rotate "factors"
# to the next round, but it makes code more complex. And the last word
# is the keyword, i.e. "almost 1/2" is kind of a complexity cap [for the
# time being]...
#
# Reduce per-round instruction count in Thumb-2 case by 16%. This is
# achieved by folding ldr/str pairs to their double-word counterparts.
# Theoretically this should have improved performance on single-issue
# cores, such as Cortex-A5/A7, by 19%. Reality is a bit different, as
# usual...
#
########################################################################
# Numbers are cycles per processed byte. Non-NEON results account even
# for input bit interleaving.
#
#		r=1088(*)   Thumb-2(**)	NEON
#
# ARM11xx	82/+150%
# Cortex-A5	88/+160%,	86,	36
# Cortex-A7	78/+160%,	68,	34
# Cortex-A8	51/+230%,	57,	30
# Cortex-A9	53/+210%,	51,	26
# Cortex-A15	42/+160%,	38,	18
# Snapdragon S4	43/+210%,	38,	24
#
# (*)	Corresponds to SHA3-256. Percentage after slash is improvement
#	over compiler-generated KECCAK_2X reference code.
# (**)	Thumb-2 results for Cortex-A5/A7 are likely to apply even to
#	Cortex-Mx, x>=3. Otherwise, non-NEON results for NEON-capable
#	processors are presented mostly for reference purposes.
# Standard perlasm argument handling: the script is invoked either as
#	keccak1600-armv4.pl <flavour> <output-file>
# or with just an output file name. A "flavour" (e.g. linux32, ios32)
# selects the assembler dialect that arm-xlate.pl translates to.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Locate the arm-xlate.pl translator next to this script or in the
    # shared perlasm directory, then pipe all generated code through it.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Check the pipe open: a silent failure here would produce an empty
    # or truncated assembler file downstream.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} else {
    # No translation requested; emit "generic" assembler directly.
    open STDOUT,">",$output or die "can't open $output: $!";
}

# Register allocation shared by the non-NEON code paths below:
# @C holds the ten 32-bit halves of five interleaved 64-bit lanes,
# @E is the scratch quartet (r12 is ip, r14 is lr).
my @C = map("r$_",(0..9));
my @E = map("r$_",(10..12,14));

########################################################################
# Stack layout
# ----->+-----------------------+
#       | uint64_t A[5][5]      |
#       | ...                   |
# +200->+-----------------------+
#       | uint64_t D[5]         |
#       | ...                   |
# +240->+-----------------------+
#       | uint64_t T[5][5]      |
#       | ...                   |
# +440->+-----------------------+
#       | saved lr              |
# +444->+-----------------------+
#       | loop counter          |
# +448->+-----------------------+
#       | ...
108 109my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20)); 110my @D = map(8*$_, (25..29)); 111my @T = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (30,35,40,45,50)); 112 113$code.=<<___; 114#include "arm_arch.h" 115 116.text 117 118#if defined(__thumb2__) 119.syntax unified 120.thumb 121#else 122.code 32 123#endif 124 125.type iotas32, %object 126.align 5 127iotas32: 128 .long 0x00000001, 0x00000000 129 .long 0x00000000, 0x00000089 130 .long 0x00000000, 0x8000008b 131 .long 0x00000000, 0x80008080 132 .long 0x00000001, 0x0000008b 133 .long 0x00000001, 0x00008000 134 .long 0x00000001, 0x80008088 135 .long 0x00000001, 0x80000082 136 .long 0x00000000, 0x0000000b 137 .long 0x00000000, 0x0000000a 138 .long 0x00000001, 0x00008082 139 .long 0x00000000, 0x00008003 140 .long 0x00000001, 0x0000808b 141 .long 0x00000001, 0x8000000b 142 .long 0x00000001, 0x8000008a 143 .long 0x00000001, 0x80000081 144 .long 0x00000000, 0x80000081 145 .long 0x00000000, 0x80000008 146 .long 0x00000000, 0x00000083 147 .long 0x00000000, 0x80008003 148 .long 0x00000001, 0x80008088 149 .long 0x00000000, 0x80000088 150 .long 0x00000001, 0x00008000 151 .long 0x00000000, 0x80008082 152.size iotas32,.-iotas32 153 154.type KeccakF1600_int, %function 155.align 5 156KeccakF1600_int: 157 add @C[9],sp,#$A[4][2] 158 add @E[2],sp,#$A[0][0] 159 add @E[0],sp,#$A[1][0] 160 ldmia @C[9],{@C[4]-@C[9]} @ A[4][2..4] 161KeccakF1600_enter: 162 str lr,[sp,#440] 163 eor @E[1],@E[1],@E[1] 164 str @E[1],[sp,#444] 165 b .Lround2x 166 167.align 4 168.Lround2x: 169___ 170sub Round { 171my (@A,@R); (@A[0..4],@R) = @_; 172 173$code.=<<___; 174 ldmia @E[2],{@C[0]-@C[3]} @ A[0][0..1] 175 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][0..1] 176#ifdef __thumb2__ 177 eor @C[0],@C[0],@E[0] 178 eor @C[1],@C[1],@E[1] 179 eor @C[2],@C[2],@E[2] 180 ldrd @E[0],@E[1],[sp,#$A[1][2]] 181 eor @C[3],@C[3],@E[3] 182 ldrd @E[2],@E[3],[sp,#$A[1][3]] 183 eor @C[4],@C[4],@E[0] 184 eor @C[5],@C[5],@E[1] 185 eor 
@C[6],@C[6],@E[2] 186 ldrd @E[0],@E[1],[sp,#$A[1][4]] 187 eor @C[7],@C[7],@E[3] 188 ldrd @E[2],@E[3],[sp,#$A[2][0]] 189 eor @C[8],@C[8],@E[0] 190 eor @C[9],@C[9],@E[1] 191 eor @C[0],@C[0],@E[2] 192 ldrd @E[0],@E[1],[sp,#$A[2][1]] 193 eor @C[1],@C[1],@E[3] 194 ldrd @E[2],@E[3],[sp,#$A[2][2]] 195 eor @C[2],@C[2],@E[0] 196 eor @C[3],@C[3],@E[1] 197 eor @C[4],@C[4],@E[2] 198 ldrd @E[0],@E[1],[sp,#$A[2][3]] 199 eor @C[5],@C[5],@E[3] 200 ldrd @E[2],@E[3],[sp,#$A[2][4]] 201 eor @C[6],@C[6],@E[0] 202 eor @C[7],@C[7],@E[1] 203 eor @C[8],@C[8],@E[2] 204 ldrd @E[0],@E[1],[sp,#$A[3][0]] 205 eor @C[9],@C[9],@E[3] 206 ldrd @E[2],@E[3],[sp,#$A[3][1]] 207 eor @C[0],@C[0],@E[0] 208 eor @C[1],@C[1],@E[1] 209 eor @C[2],@C[2],@E[2] 210 ldrd @E[0],@E[1],[sp,#$A[3][2]] 211 eor @C[3],@C[3],@E[3] 212 ldrd @E[2],@E[3],[sp,#$A[3][3]] 213 eor @C[4],@C[4],@E[0] 214 eor @C[5],@C[5],@E[1] 215 eor @C[6],@C[6],@E[2] 216 ldrd @E[0],@E[1],[sp,#$A[3][4]] 217 eor @C[7],@C[7],@E[3] 218 ldrd @E[2],@E[3],[sp,#$A[4][0]] 219 eor @C[8],@C[8],@E[0] 220 eor @C[9],@C[9],@E[1] 221 eor @C[0],@C[0],@E[2] 222 ldrd @E[0],@E[1],[sp,#$A[4][1]] 223 eor @C[1],@C[1],@E[3] 224 ldrd @E[2],@E[3],[sp,#$A[0][2]] 225 eor @C[2],@C[2],@E[0] 226 eor @C[3],@C[3],@E[1] 227 eor @C[4],@C[4],@E[2] 228 ldrd @E[0],@E[1],[sp,#$A[0][3]] 229 eor @C[5],@C[5],@E[3] 230 ldrd @E[2],@E[3],[sp,#$A[0][4]] 231#else 232 eor @C[0],@C[0],@E[0] 233 add @E[0],sp,#$A[1][2] 234 eor @C[1],@C[1],@E[1] 235 eor @C[2],@C[2],@E[2] 236 eor @C[3],@C[3],@E[3] 237 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][2..3] 238 eor @C[4],@C[4],@E[0] 239 add @E[0],sp,#$A[1][4] 240 eor @C[5],@C[5],@E[1] 241 eor @C[6],@C[6],@E[2] 242 eor @C[7],@C[7],@E[3] 243 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][4]..A[2][0] 244 eor @C[8],@C[8],@E[0] 245 add @E[0],sp,#$A[2][1] 246 eor @C[9],@C[9],@E[1] 247 eor @C[0],@C[0],@E[2] 248 eor @C[1],@C[1],@E[3] 249 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][1..2] 250 eor @C[2],@C[2],@E[0] 251 add @E[0],sp,#$A[2][3] 252 eor @C[3],@C[3],@E[1] 253 eor 
@C[4],@C[4],@E[2] 254 eor @C[5],@C[5],@E[3] 255 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][3..4] 256 eor @C[6],@C[6],@E[0] 257 add @E[0],sp,#$A[3][0] 258 eor @C[7],@C[7],@E[1] 259 eor @C[8],@C[8],@E[2] 260 eor @C[9],@C[9],@E[3] 261 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][0..1] 262 eor @C[0],@C[0],@E[0] 263 add @E[0],sp,#$A[3][2] 264 eor @C[1],@C[1],@E[1] 265 eor @C[2],@C[2],@E[2] 266 eor @C[3],@C[3],@E[3] 267 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][2..3] 268 eor @C[4],@C[4],@E[0] 269 add @E[0],sp,#$A[3][4] 270 eor @C[5],@C[5],@E[1] 271 eor @C[6],@C[6],@E[2] 272 eor @C[7],@C[7],@E[3] 273 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][4]..A[4][0] 274 eor @C[8],@C[8],@E[0] 275 ldr @E[0],[sp,#$A[4][1]] @ A[4][1] 276 eor @C[9],@C[9],@E[1] 277 ldr @E[1],[sp,#$A[4][1]+4] 278 eor @C[0],@C[0],@E[2] 279 ldr @E[2],[sp,#$A[0][2]] @ A[0][2] 280 eor @C[1],@C[1],@E[3] 281 ldr @E[3],[sp,#$A[0][2]+4] 282 eor @C[2],@C[2],@E[0] 283 add @E[0],sp,#$A[0][3] 284 eor @C[3],@C[3],@E[1] 285 eor @C[4],@C[4],@E[2] 286 eor @C[5],@C[5],@E[3] 287 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[0][3..4] 288#endif 289 eor @C[6],@C[6],@E[0] 290 eor @C[7],@C[7],@E[1] 291 eor @C[8],@C[8],@E[2] 292 eor @C[9],@C[9],@E[3] 293 294 eor @E[0],@C[0],@C[5],ror#32-1 @ E[0] = ROL64(C[2], 1) ^ C[0]; 295 str.l @E[0],[sp,#$D[1]] @ D[1] = E[0] 296 eor @E[1],@C[1],@C[4] 297 str.h @E[1],[sp,#$D[1]+4] 298 eor @E[2],@C[6],@C[1],ror#32-1 @ E[1] = ROL64(C[0], 1) ^ C[3]; 299 eor @E[3],@C[7],@C[0] 300 str.l @E[2],[sp,#$D[4]] @ D[4] = E[1] 301 eor @C[0],@C[8],@C[3],ror#32-1 @ C[0] = ROL64(C[1], 1) ^ C[4]; 302 str.h @E[3],[sp,#$D[4]+4] 303 eor @C[1],@C[9],@C[2] 304 str.l @C[0],[sp,#$D[0]] @ D[0] = C[0] 305 eor @C[2],@C[2],@C[7],ror#32-1 @ C[1] = ROL64(C[3], 1) ^ C[1]; 306 ldr.l @C[7],[sp,#$A[3][3]] 307 eor @C[3],@C[3],@C[6] 308 str.h @C[1],[sp,#$D[0]+4] 309 ldr.h @C[6],[sp,#$A[3][3]+4] 310 str.l @C[2],[sp,#$D[2]] @ D[2] = C[1] 311 eor @C[4],@C[4],@C[9],ror#32-1 @ C[2] = ROL64(C[4], 1) ^ C[2]; 312 str.h @C[3],[sp,#$D[2]+4] 313 eor @C[5],@C[5],@C[8] 
314 315 ldr.l @C[8],[sp,#$A[4][4]] 316 ldr.h @C[9],[sp,#$A[4][4]+4] 317 str.l @C[4],[sp,#$D[3]] @ D[3] = C[2] 318 eor @C[7],@C[7],@C[4] 319 str.h @C[5],[sp,#$D[3]+4] 320 eor @C[6],@C[6],@C[5] 321 ldr.l @C[4],[sp,#$A[0][0]] 322 @ ror @C[7],@C[7],#32-10 @ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */ 323 @ ror @C[6],@C[6],#32-11 324 ldr.h @C[5],[sp,#$A[0][0]+4] 325 eor @C[8],@C[8],@E[2] 326 eor @C[9],@C[9],@E[3] 327 ldr.l @E[2],[sp,#$A[2][2]] 328 eor @C[0],@C[0],@C[4] 329 ldr.h @E[3],[sp,#$A[2][2]+4] 330 @ ror @C[8],@C[8],#32-7 @ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */ 331 @ ror @C[9],@C[9],#32-7 332 eor @C[1],@C[1],@C[5] @ C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */ 333 eor @E[2],@E[2],@C[2] 334 ldr.l @C[2],[sp,#$A[1][1]] 335 eor @E[3],@E[3],@C[3] 336 ldr.h @C[3],[sp,#$A[1][1]+4] 337 ror @C[5],@E[2],#32-21 @ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */ 338 ldr @E[2],[sp,#444] @ load counter 339 eor @C[2],@C[2],@E[0] 340 adr @E[0],iotas32 341 ror @C[4],@E[3],#32-22 342 add @E[3],@E[0],@E[2] 343 eor @C[3],@C[3],@E[1] 344___ 345$code.=<<___ if ($A[0][0] != $T[0][0]); 346 ldmia @E[3],{@E[0],@E[1]} @ iotas[i] 347___ 348$code.=<<___ if ($A[0][0] == $T[0][0]); 349 ldr.l @E[0],[@E[3],#8] @ iotas[i].lo 350 add @E[2],@E[2],#16 351 ldr.h @E[1],[@E[3],#12] @ iotas[i].hi 352 cmp @E[2],#192 353 str @E[2],[sp,#444] @ store counter 354___ 355$code.=<<___; 356 bic @E[2],@C[4],@C[2],ror#32-22 357 bic @E[3],@C[5],@C[3],ror#32-22 358 ror @C[2],@C[2],#32-22 @ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */ 359 ror @C[3],@C[3],#32-22 360 eor @E[2],@E[2],@C[0] 361 eor @E[3],@E[3],@C[1] 362 eor @E[0],@E[0],@E[2] 363 eor @E[1],@E[1],@E[3] 364 str.l @E[0],[sp,#$R[0][0]] @ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i]; 365 bic @E[2],@C[6],@C[4],ror#11 366 str.h @E[1],[sp,#$R[0][0]+4] 367 bic @E[3],@C[7],@C[5],ror#10 368 bic @E[0],@C[8],@C[6],ror#32-(11-7) 369 bic @E[1],@C[9],@C[7],ror#32-(10-7) 370 eor @E[2],@C[2],@E[2],ror#32-11 371 str.l 
@E[2],[sp,#$R[0][1]] @ R[0][1] = C[1] ^ (~C[2] & C[3]); 372 eor @E[3],@C[3],@E[3],ror#32-10 373 str.h @E[3],[sp,#$R[0][1]+4] 374 eor @E[0],@C[4],@E[0],ror#32-7 375 eor @E[1],@C[5],@E[1],ror#32-7 376 str.l @E[0],[sp,#$R[0][2]] @ R[0][2] = C[2] ^ (~C[3] & C[4]); 377 bic @E[2],@C[0],@C[8],ror#32-7 378 str.h @E[1],[sp,#$R[0][2]+4] 379 bic @E[3],@C[1],@C[9],ror#32-7 380 eor @E[2],@E[2],@C[6],ror#32-11 381 str.l @E[2],[sp,#$R[0][3]] @ R[0][3] = C[3] ^ (~C[4] & C[0]); 382 eor @E[3],@E[3],@C[7],ror#32-10 383 str.h @E[3],[sp,#$R[0][3]+4] 384 bic @E[0],@C[2],@C[0] 385 add @E[3],sp,#$D[3] 386 ldr.l @C[0],[sp,#$A[0][3]] @ A[0][3] 387 bic @E[1],@C[3],@C[1] 388 ldr.h @C[1],[sp,#$A[0][3]+4] 389 eor @E[0],@E[0],@C[8],ror#32-7 390 eor @E[1],@E[1],@C[9],ror#32-7 391 str.l @E[0],[sp,#$R[0][4]] @ R[0][4] = C[4] ^ (~C[0] & C[1]); 392 add @C[9],sp,#$D[0] 393 str.h @E[1],[sp,#$R[0][4]+4] 394 395 ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[3..4] 396 ldmia @C[9],{@C[6]-@C[9]} @ D[0..1] 397 398 ldr.l @C[2],[sp,#$A[1][4]] @ A[1][4] 399 eor @C[0],@C[0],@E[0] 400 ldr.h @C[3],[sp,#$A[1][4]+4] 401 eor @C[1],@C[1],@E[1] 402 @ ror @C[0],@C[0],#32-14 @ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]); 403 ldr.l @E[0],[sp,#$A[3][1]] @ A[3][1] 404 @ ror @C[1],@C[1],#32-14 405 ldr.h @E[1],[sp,#$A[3][1]+4] 406 407 eor @C[2],@C[2],@E[2] 408 ldr.l @C[4],[sp,#$A[2][0]] @ A[2][0] 409 eor @C[3],@C[3],@E[3] 410 ldr.h @C[5],[sp,#$A[2][0]+4] 411 @ ror @C[2],@C[2],#32-10 @ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]); 412 @ ror @C[3],@C[3],#32-10 413 414 eor @C[6],@C[6],@C[4] 415 ldr.l @E[2],[sp,#$D[2]] @ D[2] 416 eor @C[7],@C[7],@C[5] 417 ldr.h @E[3],[sp,#$D[2]+4] 418 ror @C[5],@C[6],#32-1 @ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]); 419 ror @C[4],@C[7],#32-2 420 421 eor @E[0],@E[0],@C[8] 422 ldr.l @C[8],[sp,#$A[4][2]] @ A[4][2] 423 eor @E[1],@E[1],@C[9] 424 ldr.h @C[9],[sp,#$A[4][2]+4] 425 ror @C[7],@E[0],#32-22 @ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]); 426 ror @C[6],@E[1],#32-23 427 428 bic 
@E[0],@C[4],@C[2],ror#32-10 429 bic @E[1],@C[5],@C[3],ror#32-10 430 eor @E[2],@E[2],@C[8] 431 eor @E[3],@E[3],@C[9] 432 ror @C[9],@E[2],#32-30 @ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]); 433 ror @C[8],@E[3],#32-31 434 eor @E[0],@E[0],@C[0],ror#32-14 435 eor @E[1],@E[1],@C[1],ror#32-14 436 str.l @E[0],[sp,#$R[1][0]] @ R[1][0] = C[0] ^ (~C[1] & C[2]) 437 bic @E[2],@C[6],@C[4] 438 str.h @E[1],[sp,#$R[1][0]+4] 439 bic @E[3],@C[7],@C[5] 440 eor @E[2],@E[2],@C[2],ror#32-10 441 str.l @E[2],[sp,#$R[1][1]] @ R[1][1] = C[1] ^ (~C[2] & C[3]); 442 eor @E[3],@E[3],@C[3],ror#32-10 443 str.h @E[3],[sp,#$R[1][1]+4] 444 bic @E[0],@C[8],@C[6] 445 bic @E[1],@C[9],@C[7] 446 bic @E[2],@C[0],@C[8],ror#14 447 bic @E[3],@C[1],@C[9],ror#14 448 eor @E[0],@E[0],@C[4] 449 eor @E[1],@E[1],@C[5] 450 str.l @E[0],[sp,#$R[1][2]] @ R[1][2] = C[2] ^ (~C[3] & C[4]); 451 bic @C[2],@C[2],@C[0],ror#32-(14-10) 452 str.h @E[1],[sp,#$R[1][2]+4] 453 eor @E[2],@C[6],@E[2],ror#32-14 454 bic @E[1],@C[3],@C[1],ror#32-(14-10) 455 str.l @E[2],[sp,#$R[1][3]] @ R[1][3] = C[3] ^ (~C[4] & C[0]); 456 eor @E[3],@C[7],@E[3],ror#32-14 457 str.h @E[3],[sp,#$R[1][3]+4] 458 add @E[2],sp,#$D[1] 459 ldr.l @C[1],[sp,#$A[0][1]] @ A[0][1] 460 eor @E[0],@C[8],@C[2],ror#32-10 461 ldr.h @C[0],[sp,#$A[0][1]+4] 462 eor @E[1],@C[9],@E[1],ror#32-10 463 str.l @E[0],[sp,#$R[1][4]] @ R[1][4] = C[4] ^ (~C[0] & C[1]); 464 str.h @E[1],[sp,#$R[1][4]+4] 465 466 add @C[9],sp,#$D[3] 467 ldmia @E[2],{@E[0]-@E[2],@E[3]} @ D[1..2] 468 ldr.l @C[2],[sp,#$A[1][2]] @ A[1][2] 469 ldr.h @C[3],[sp,#$A[1][2]+4] 470 ldmia @C[9],{@C[6]-@C[9]} @ D[3..4] 471 472 eor @C[1],@C[1],@E[0] 473 ldr.l @C[4],[sp,#$A[2][3]] @ A[2][3] 474 eor @C[0],@C[0],@E[1] 475 ldr.h @C[5],[sp,#$A[2][3]+4] 476 ror @C[0],@C[0],#32-1 @ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]); 477 478 eor @C[2],@C[2],@E[2] 479 ldr.l @E[0],[sp,#$A[3][4]] @ A[3][4] 480 eor @C[3],@C[3],@E[3] 481 ldr.h @E[1],[sp,#$A[3][4]+4] 482 @ ror @C[2],@C[2],#32-3 @ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]); 
483 ldr.l @E[2],[sp,#$D[0]] @ D[0] 484 @ ror @C[3],@C[3],#32-3 485 ldr.h @E[3],[sp,#$D[0]+4] 486 487 eor @C[4],@C[4],@C[6] 488 eor @C[5],@C[5],@C[7] 489 @ ror @C[5],@C[6],#32-12 @ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]); 490 @ ror @C[4],@C[7],#32-13 @ [track reverse order below] 491 492 eor @E[0],@E[0],@C[8] 493 ldr.l @C[8],[sp,#$A[4][0]] @ A[4][0] 494 eor @E[1],@E[1],@C[9] 495 ldr.h @C[9],[sp,#$A[4][0]+4] 496 ror @C[6],@E[0],#32-4 @ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]); 497 ror @C[7],@E[1],#32-4 498 499 eor @E[2],@E[2],@C[8] 500 eor @E[3],@E[3],@C[9] 501 ror @C[8],@E[2],#32-9 @ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]); 502 ror @C[9],@E[3],#32-9 503 504 bic @E[0],@C[5],@C[2],ror#13-3 505 bic @E[1],@C[4],@C[3],ror#12-3 506 bic @E[2],@C[6],@C[5],ror#32-13 507 bic @E[3],@C[7],@C[4],ror#32-12 508 eor @E[0],@C[0],@E[0],ror#32-13 509 eor @E[1],@C[1],@E[1],ror#32-12 510 str.l @E[0],[sp,#$R[2][0]] @ R[2][0] = C[0] ^ (~C[1] & C[2]) 511 eor @E[2],@E[2],@C[2],ror#32-3 512 str.h @E[1],[sp,#$R[2][0]+4] 513 eor @E[3],@E[3],@C[3],ror#32-3 514 str.l @E[2],[sp,#$R[2][1]] @ R[2][1] = C[1] ^ (~C[2] & C[3]); 515 bic @E[0],@C[8],@C[6] 516 bic @E[1],@C[9],@C[7] 517 str.h @E[3],[sp,#$R[2][1]+4] 518 eor @E[0],@E[0],@C[5],ror#32-13 519 eor @E[1],@E[1],@C[4],ror#32-12 520 str.l @E[0],[sp,#$R[2][2]] @ R[2][2] = C[2] ^ (~C[3] & C[4]); 521 bic @E[2],@C[0],@C[8] 522 str.h @E[1],[sp,#$R[2][2]+4] 523 bic @E[3],@C[1],@C[9] 524 eor @E[2],@E[2],@C[6] 525 eor @E[3],@E[3],@C[7] 526 str.l @E[2],[sp,#$R[2][3]] @ R[2][3] = C[3] ^ (~C[4] & C[0]); 527 bic @E[0],@C[2],@C[0],ror#3 528 str.h @E[3],[sp,#$R[2][3]+4] 529 bic @E[1],@C[3],@C[1],ror#3 530 ldr.l @C[1],[sp,#$A[0][4]] @ A[0][4] [in reverse order] 531 eor @E[0],@C[8],@E[0],ror#32-3 532 ldr.h @C[0],[sp,#$A[0][4]+4] 533 eor @E[1],@C[9],@E[1],ror#32-3 534 str.l @E[0],[sp,#$R[2][4]] @ R[2][4] = C[4] ^ (~C[0] & C[1]); 535 add @C[9],sp,#$D[1] 536 str.h @E[1],[sp,#$R[2][4]+4] 537 538 ldr.l @E[0],[sp,#$D[4]] @ D[4] 539 ldr.h 
@E[1],[sp,#$D[4]+4] 540 ldr.l @E[2],[sp,#$D[0]] @ D[0] 541 ldr.h @E[3],[sp,#$D[0]+4] 542 543 ldmia @C[9],{@C[6]-@C[9]} @ D[1..2] 544 545 eor @C[1],@C[1],@E[0] 546 ldr.l @C[2],[sp,#$A[1][0]] @ A[1][0] 547 eor @C[0],@C[0],@E[1] 548 ldr.h @C[3],[sp,#$A[1][0]+4] 549 @ ror @C[1],@E[0],#32-13 @ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]); 550 ldr.l @C[4],[sp,#$A[2][1]] @ A[2][1] 551 @ ror @C[0],@E[1],#32-14 @ [was loaded in reverse order] 552 ldr.h @C[5],[sp,#$A[2][1]+4] 553 554 eor @C[2],@C[2],@E[2] 555 ldr.l @E[0],[sp,#$A[3][2]] @ A[3][2] 556 eor @C[3],@C[3],@E[3] 557 ldr.h @E[1],[sp,#$A[3][2]+4] 558 @ ror @C[2],@C[2],#32-18 @ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]); 559 ldr.l @E[2],[sp,#$D[3]] @ D[3] 560 @ ror @C[3],@C[3],#32-18 561 ldr.h @E[3],[sp,#$D[3]+4] 562 563 eor @C[6],@C[6],@C[4] 564 eor @C[7],@C[7],@C[5] 565 ror @C[4],@C[6],#32-5 @ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]); 566 ror @C[5],@C[7],#32-5 567 568 eor @E[0],@E[0],@C[8] 569 ldr.l @C[8],[sp,#$A[4][3]] @ A[4][3] 570 eor @E[1],@E[1],@C[9] 571 ldr.h @C[9],[sp,#$A[4][3]+4] 572 ror @C[7],@E[0],#32-7 @ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]); 573 ror @C[6],@E[1],#32-8 574 575 eor @E[2],@E[2],@C[8] 576 eor @E[3],@E[3],@C[9] 577 ror @C[8],@E[2],#32-28 @ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]); 578 ror @C[9],@E[3],#32-28 579 580 bic @E[0],@C[4],@C[2],ror#32-18 581 bic @E[1],@C[5],@C[3],ror#32-18 582 eor @E[0],@E[0],@C[0],ror#32-14 583 eor @E[1],@E[1],@C[1],ror#32-13 584 str.l @E[0],[sp,#$R[3][0]] @ R[3][0] = C[0] ^ (~C[1] & C[2]) 585 bic @E[2],@C[6],@C[4] 586 str.h @E[1],[sp,#$R[3][0]+4] 587 bic @E[3],@C[7],@C[5] 588 eor @E[2],@E[2],@C[2],ror#32-18 589 str.l @E[2],[sp,#$R[3][1]] @ R[3][1] = C[1] ^ (~C[2] & C[3]); 590 eor @E[3],@E[3],@C[3],ror#32-18 591 str.h @E[3],[sp,#$R[3][1]+4] 592 bic @E[0],@C[8],@C[6] 593 bic @E[1],@C[9],@C[7] 594 bic @E[2],@C[0],@C[8],ror#14 595 bic @E[3],@C[1],@C[9],ror#13 596 eor @E[0],@E[0],@C[4] 597 eor @E[1],@E[1],@C[5] 598 str.l @E[0],[sp,#$R[3][2]] @ R[3][2] = 
C[2] ^ (~C[3] & C[4]); 599 bic @C[2],@C[2],@C[0],ror#18-14 600 str.h @E[1],[sp,#$R[3][2]+4] 601 eor @E[2],@C[6],@E[2],ror#32-14 602 bic @E[1],@C[3],@C[1],ror#18-13 603 eor @E[3],@C[7],@E[3],ror#32-13 604 str.l @E[2],[sp,#$R[3][3]] @ R[3][3] = C[3] ^ (~C[4] & C[0]); 605 str.h @E[3],[sp,#$R[3][3]+4] 606 add @E[3],sp,#$D[2] 607 ldr.l @C[0],[sp,#$A[0][2]] @ A[0][2] 608 eor @E[0],@C[8],@C[2],ror#32-18 609 ldr.h @C[1],[sp,#$A[0][2]+4] 610 eor @E[1],@C[9],@E[1],ror#32-18 611 str.l @E[0],[sp,#$R[3][4]] @ R[3][4] = C[4] ^ (~C[0] & C[1]); 612 str.h @E[1],[sp,#$R[3][4]+4] 613 614 ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[2..3] 615 ldr.l @C[2],[sp,#$A[1][3]] @ A[1][3] 616 ldr.h @C[3],[sp,#$A[1][3]+4] 617 ldr.l @C[6],[sp,#$D[4]] @ D[4] 618 ldr.h @C[7],[sp,#$D[4]+4] 619 620 eor @C[0],@C[0],@E[0] 621 ldr.l @C[4],[sp,#$A[2][4]] @ A[2][4] 622 eor @C[1],@C[1],@E[1] 623 ldr.h @C[5],[sp,#$A[2][4]+4] 624 @ ror @C[0],@C[0],#32-31 @ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]); 625 ldr.l @C[8],[sp,#$D[0]] @ D[0] 626 @ ror @C[1],@C[1],#32-31 627 ldr.h @C[9],[sp,#$D[0]+4] 628 629 eor @E[2],@E[2],@C[2] 630 ldr.l @E[0],[sp,#$A[3][0]] @ A[3][0] 631 eor @E[3],@E[3],@C[3] 632 ldr.h @E[1],[sp,#$A[3][0]+4] 633 ror @C[3],@E[2],#32-27 @ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]); 634 ldr.l @E[2],[sp,#$D[1]] @ D[1] 635 ror @C[2],@E[3],#32-28 636 ldr.h @E[3],[sp,#$D[1]+4] 637 638 eor @C[6],@C[6],@C[4] 639 eor @C[7],@C[7],@C[5] 640 ror @C[5],@C[6],#32-19 @ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]); 641 ror @C[4],@C[7],#32-20 642 643 eor @E[0],@E[0],@C[8] 644 ldr.l @C[8],[sp,#$A[4][1]] @ A[4][1] 645 eor @E[1],@E[1],@C[9] 646 ldr.h @C[9],[sp,#$A[4][1]+4] 647 ror @C[7],@E[0],#32-20 @ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]); 648 ror @C[6],@E[1],#32-21 649 650 eor @C[8],@C[8],@E[2] 651 eor @C[9],@C[9],@E[3] 652 @ ror @C[8],@C[2],#32-1 @ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]); 653 @ ror @C[9],@C[3],#32-1 654 655 bic @E[0],@C[4],@C[2] 656 bic @E[1],@C[5],@C[3] 657 eor @E[0],@E[0],@C[0],ror#32-31 
658 str.l @E[0],[sp,#$R[4][0]] @ R[4][0] = C[0] ^ (~C[1] & C[2]) 659 eor @E[1],@E[1],@C[1],ror#32-31 660 str.h @E[1],[sp,#$R[4][0]+4] 661 bic @E[2],@C[6],@C[4] 662 bic @E[3],@C[7],@C[5] 663 eor @E[2],@E[2],@C[2] 664 eor @E[3],@E[3],@C[3] 665 str.l @E[2],[sp,#$R[4][1]] @ R[4][1] = C[1] ^ (~C[2] & C[3]); 666 bic @E[0],@C[8],@C[6],ror#1 667 str.h @E[3],[sp,#$R[4][1]+4] 668 bic @E[1],@C[9],@C[7],ror#1 669 bic @E[2],@C[0],@C[8],ror#31-1 670 bic @E[3],@C[1],@C[9],ror#31-1 671 eor @C[4],@C[4],@E[0],ror#32-1 672 str.l @C[4],[sp,#$R[4][2]] @ R[4][2] = C[2] ^= (~C[3] & C[4]); 673 eor @C[5],@C[5],@E[1],ror#32-1 674 str.h @C[5],[sp,#$R[4][2]+4] 675 eor @C[6],@C[6],@E[2],ror#32-31 676 eor @C[7],@C[7],@E[3],ror#32-31 677 str.l @C[6],[sp,#$R[4][3]] @ R[4][3] = C[3] ^= (~C[4] & C[0]); 678 bic @E[0],@C[2],@C[0],ror#32-31 679 str.h @C[7],[sp,#$R[4][3]+4] 680 bic @E[1],@C[3],@C[1],ror#32-31 681 add @E[2],sp,#$R[0][0] 682 eor @C[8],@E[0],@C[8],ror#32-1 683 add @E[0],sp,#$R[1][0] 684 eor @C[9],@E[1],@C[9],ror#32-1 685 str.l @C[8],[sp,#$R[4][4]] @ R[4][4] = C[4] ^= (~C[0] & C[1]); 686 str.h @C[9],[sp,#$R[4][4]+4] 687___ 688} 689 Round(@A,@T); 690 Round(@T,@A); 691$code.=<<___; 692 blo .Lround2x 693 694 ldr pc,[sp,#440] 695.size KeccakF1600_int,.-KeccakF1600_int 696 697.type KeccakF1600, %function 698.align 5 699KeccakF1600: 700 stmdb sp!,{r0,r4-r11,lr} 701 sub sp,sp,#440+16 @ space for A[5][5],D[5],T[5][5],... 
702 703 add @E[0],r0,#$A[1][0] 704 add @E[1],sp,#$A[1][0] 705 ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack 706 stmia sp, {@C[0]-@C[9]} 707 ldmia @E[0]!,{@C[0]-@C[9]} 708 stmia @E[1]!,{@C[0]-@C[9]} 709 ldmia @E[0]!,{@C[0]-@C[9]} 710 stmia @E[1]!,{@C[0]-@C[9]} 711 ldmia @E[0]!,{@C[0]-@C[9]} 712 stmia @E[1]!,{@C[0]-@C[9]} 713 ldmia @E[0], {@C[0]-@C[9]} 714 add @E[2],sp,#$A[0][0] 715 add @E[0],sp,#$A[1][0] 716 stmia @E[1], {@C[0]-@C[9]} 717 718 bl KeccakF1600_enter 719 720 ldr @E[1], [sp,#440+16] @ restore pointer to A 721 ldmia sp, {@C[0]-@C[9]} 722 stmia @E[1]!,{@C[0]-@C[9]} @ return A[5][5] 723 ldmia @E[0]!,{@C[0]-@C[9]} 724 stmia @E[1]!,{@C[0]-@C[9]} 725 ldmia @E[0]!,{@C[0]-@C[9]} 726 stmia @E[1]!,{@C[0]-@C[9]} 727 ldmia @E[0]!,{@C[0]-@C[9]} 728 stmia @E[1]!,{@C[0]-@C[9]} 729 ldmia @E[0], {@C[0]-@C[9]} 730 stmia @E[1], {@C[0]-@C[9]} 731 732 add sp,sp,#440+20 733 ldmia sp!,{r4-r11,pc} 734.size KeccakF1600,.-KeccakF1600 735___ 736{ my ($A_flat,$inp,$len,$bsz) = map("r$_",(10..12,14)); 737 738######################################################################## 739# Stack layout 740# ----->+-----------------------+ 741# | uint64_t A[5][5] | 742# | ... | 743# | ... | 744# +456->+-----------------------+ 745# | 0x55555555 | 746# +460->+-----------------------+ 747# | 0x33333333 | 748# +464->+-----------------------+ 749# | 0x0f0f0f0f | 750# +468->+-----------------------+ 751# | 0x00ff00ff | 752# +472->+-----------------------+ 753# | uint64_t *A | 754# +476->+-----------------------+ 755# | const void *inp | 756# +480->+-----------------------+ 757# | size_t len | 758# +484->+-----------------------+ 759# | size_t bs | 760# +488->+-----------------------+ 761# | .... 
762 763$code.=<<___; 764.global SHA3_absorb 765.type SHA3_absorb,%function 766.align 5 767SHA3_absorb: 768 stmdb sp!,{r0-r12,lr} 769 sub sp,sp,#456+16 770 771 add $A_flat,r0,#$A[1][0] 772 @ mov $inp,r1 773 mov $len,r2 774 mov $bsz,r3 775 cmp r2,r3 776 blo .Labsorb_abort 777 778 add $inp,sp,#0 779 ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack 780 stmia $inp!, {@C[0]-@C[9]} 781 ldmia $A_flat!,{@C[0]-@C[9]} 782 stmia $inp!, {@C[0]-@C[9]} 783 ldmia $A_flat!,{@C[0]-@C[9]} 784 stmia $inp!, {@C[0]-@C[9]} 785 ldmia $A_flat!,{@C[0]-@C[9]} 786 stmia $inp!, {@C[0]-@C[9]} 787 ldmia $A_flat!,{@C[0]-@C[9]} 788 stmia $inp, {@C[0]-@C[9]} 789 790 ldr $inp,[sp,#476] @ restore $inp 791#ifdef __thumb2__ 792 mov r9,#0x00ff00ff 793 mov r8,#0x0f0f0f0f 794 mov r7,#0x33333333 795 mov r6,#0x55555555 796#else 797 mov r6,#0x11 @ compose constants 798 mov r8,#0x0f 799 mov r9,#0xff 800 orr r6,r6,r6,lsl#8 801 orr r8,r8,r8,lsl#8 802 orr r6,r6,r6,lsl#16 @ 0x11111111 803 orr r9,r9,r9,lsl#16 @ 0x00ff00ff 804 orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f 805 orr r7,r6,r6,lsl#1 @ 0x33333333 806 orr r6,r6,r6,lsl#2 @ 0x55555555 807#endif 808 str r9,[sp,#468] 809 str r8,[sp,#464] 810 str r7,[sp,#460] 811 str r6,[sp,#456] 812 b .Loop_absorb 813 814.align 4 815.Loop_absorb: 816 subs r0,$len,$bsz 817 blo .Labsorbed 818 add $A_flat,sp,#0 819 str r0,[sp,#480] @ save len - bsz 820 821.align 4 822.Loop_block: 823 ldrb r0,[$inp],#1 824 ldrb r1,[$inp],#1 825 ldrb r2,[$inp],#1 826 ldrb r3,[$inp],#1 827 ldrb r4,[$inp],#1 828 orr r0,r0,r1,lsl#8 829 ldrb r1,[$inp],#1 830 orr r0,r0,r2,lsl#16 831 ldrb r2,[$inp],#1 832 orr r0,r0,r3,lsl#24 @ lo 833 ldrb r3,[$inp],#1 834 orr r1,r4,r1,lsl#8 835 orr r1,r1,r2,lsl#16 836 orr r1,r1,r3,lsl#24 @ hi 837 838 and r2,r0,r6 @ &=0x55555555 839 and r0,r0,r6,lsl#1 @ &=0xaaaaaaaa 840 and r3,r1,r6 @ &=0x55555555 841 and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa 842 orr r2,r2,r2,lsr#1 843 orr r0,r0,r0,lsl#1 844 orr r3,r3,r3,lsr#1 845 orr r1,r1,r1,lsl#1 846 and r2,r2,r7 @ &=0x33333333 847 and r0,r0,r7,lsl#2 @ 
&=0xcccccccc 848 and r3,r3,r7 @ &=0x33333333 849 and r1,r1,r7,lsl#2 @ &=0xcccccccc 850 orr r2,r2,r2,lsr#2 851 orr r0,r0,r0,lsl#2 852 orr r3,r3,r3,lsr#2 853 orr r1,r1,r1,lsl#2 854 and r2,r2,r8 @ &=0x0f0f0f0f 855 and r0,r0,r8,lsl#4 @ &=0xf0f0f0f0 856 and r3,r3,r8 @ &=0x0f0f0f0f 857 and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0 858 ldmia $A_flat,{r4-r5} @ A_flat[i] 859 orr r2,r2,r2,lsr#4 860 orr r0,r0,r0,lsl#4 861 orr r3,r3,r3,lsr#4 862 orr r1,r1,r1,lsl#4 863 and r2,r2,r9 @ &=0x00ff00ff 864 and r0,r0,r9,lsl#8 @ &=0xff00ff00 865 and r3,r3,r9 @ &=0x00ff00ff 866 and r1,r1,r9,lsl#8 @ &=0xff00ff00 867 orr r2,r2,r2,lsr#8 868 orr r0,r0,r0,lsl#8 869 orr r3,r3,r3,lsr#8 870 orr r1,r1,r1,lsl#8 871 872 lsl r2,r2,#16 873 lsr r1,r1,#16 874 eor r4,r4,r3,lsl#16 875 eor r5,r5,r0,lsr#16 876 eor r4,r4,r2,lsr#16 877 eor r5,r5,r1,lsl#16 878 stmia $A_flat!,{r4-r5} @ A_flat[i++] ^= BitInterleave(inp[0..7]) 879 880 subs $bsz,$bsz,#8 881 bhi .Loop_block 882 883 str $inp,[sp,#476] 884 885 bl KeccakF1600_int 886 887 add r14,sp,#456 888 ldmia r14,{r6-r12,r14} @ restore constants and variables 889 b .Loop_absorb 890 891.align 4 892.Labsorbed: 893 add $inp,sp,#$A[1][0] 894 ldmia sp, {@C[0]-@C[9]} 895 stmia $A_flat!,{@C[0]-@C[9]} @ return A[5][5] 896 ldmia $inp!, {@C[0]-@C[9]} 897 stmia $A_flat!,{@C[0]-@C[9]} 898 ldmia $inp!, {@C[0]-@C[9]} 899 stmia $A_flat!,{@C[0]-@C[9]} 900 ldmia $inp!, {@C[0]-@C[9]} 901 stmia $A_flat!,{@C[0]-@C[9]} 902 ldmia $inp, {@C[0]-@C[9]} 903 stmia $A_flat, {@C[0]-@C[9]} 904 905.Labsorb_abort: 906 add sp,sp,#456+32 907 mov r0,$len @ return value 908 ldmia sp!,{r4-r12,pc} 909.size SHA3_absorb,.-SHA3_absorb 910___ 911} 912{ my ($out,$len,$A_flat,$bsz) = map("r$_", (4,5,10,12)); 913 914$code.=<<___; 915.global SHA3_squeeze 916.type SHA3_squeeze,%function 917.align 5 918SHA3_squeeze: 919 stmdb sp!,{r0,r3-r10,lr} 920 921 mov $A_flat,r0 922 mov $out,r1 923 mov $len,r2 924 mov $bsz,r3 925 926#ifdef __thumb2__ 927 mov r9,#0x00ff00ff 928 mov r8,#0x0f0f0f0f 929 mov r7,#0x33333333 930 mov 
r6,#0x55555555 931#else 932 mov r6,#0x11 @ compose constants 933 mov r8,#0x0f 934 mov r9,#0xff 935 orr r6,r6,r6,lsl#8 936 orr r8,r8,r8,lsl#8 937 orr r6,r6,r6,lsl#16 @ 0x11111111 938 orr r9,r9,r9,lsl#16 @ 0x00ff00ff 939 orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f 940 orr r7,r6,r6,lsl#1 @ 0x33333333 941 orr r6,r6,r6,lsl#2 @ 0x55555555 942#endif 943 stmdb sp!,{r6-r9} 944 945 mov r14,$A_flat 946 b .Loop_squeeze 947 948.align 4 949.Loop_squeeze: 950 ldmia $A_flat!,{r0,r1} @ A_flat[i++] 951 952 lsl r2,r0,#16 953 lsl r3,r1,#16 @ r3 = r1 << 16 954 lsr r2,r2,#16 @ r2 = r0 & 0x0000ffff 955 lsr r1,r1,#16 956 lsr r0,r0,#16 @ r0 = r0 >> 16 957 lsl r1,r1,#16 @ r1 = r1 & 0xffff0000 958 959 orr r2,r2,r2,lsl#8 960 orr r3,r3,r3,lsr#8 961 orr r0,r0,r0,lsl#8 962 orr r1,r1,r1,lsr#8 963 and r2,r2,r9 @ &=0x00ff00ff 964 and r3,r3,r9,lsl#8 @ &=0xff00ff00 965 and r0,r0,r9 @ &=0x00ff00ff 966 and r1,r1,r9,lsl#8 @ &=0xff00ff00 967 orr r2,r2,r2,lsl#4 968 orr r3,r3,r3,lsr#4 969 orr r0,r0,r0,lsl#4 970 orr r1,r1,r1,lsr#4 971 and r2,r2,r8 @ &=0x0f0f0f0f 972 and r3,r3,r8,lsl#4 @ &=0xf0f0f0f0 973 and r0,r0,r8 @ &=0x0f0f0f0f 974 and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0 975 orr r2,r2,r2,lsl#2 976 orr r3,r3,r3,lsr#2 977 orr r0,r0,r0,lsl#2 978 orr r1,r1,r1,lsr#2 979 and r2,r2,r7 @ &=0x33333333 980 and r3,r3,r7,lsl#2 @ &=0xcccccccc 981 and r0,r0,r7 @ &=0x33333333 982 and r1,r1,r7,lsl#2 @ &=0xcccccccc 983 orr r2,r2,r2,lsl#1 984 orr r3,r3,r3,lsr#1 985 orr r0,r0,r0,lsl#1 986 orr r1,r1,r1,lsr#1 987 and r2,r2,r6 @ &=0x55555555 988 and r3,r3,r6,lsl#1 @ &=0xaaaaaaaa 989 and r0,r0,r6 @ &=0x55555555 990 and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa 991 992 orr r2,r2,r3 993 orr r0,r0,r1 994 995 cmp $len,#8 996 blo .Lsqueeze_tail 997 lsr r1,r2,#8 998 strb r2,[$out],#1 999 lsr r3,r2,#16 1000 strb r1,[$out],#1 1001 lsr r2,r2,#24 1002 strb r3,[$out],#1 1003 strb r2,[$out],#1 1004 1005 lsr r1,r0,#8 1006 strb r0,[$out],#1 1007 lsr r3,r0,#16 1008 strb r1,[$out],#1 1009 lsr r0,r0,#24 1010 strb r3,[$out],#1 1011 strb r0,[$out],#1 1012 subs $len,$len,#8 
1013 beq .Lsqueeze_done 1014 1015 subs $bsz,$bsz,#8 @ bsz -= 8 1016 bhi .Loop_squeeze 1017 1018 mov r0,r14 @ original $A_flat 1019 1020 bl KeccakF1600 1021 1022 ldmia sp,{r6-r10,r12} @ restore constants and variables 1023 mov r14,$A_flat 1024 b .Loop_squeeze 1025 1026.align 4 1027.Lsqueeze_tail: 1028 strb r2,[$out],#1 1029 lsr r2,r2,#8 1030 subs $len,$len,#1 1031 beq .Lsqueeze_done 1032 strb r2,[$out],#1 1033 lsr r2,r2,#8 1034 subs $len,$len,#1 1035 beq .Lsqueeze_done 1036 strb r2,[$out],#1 1037 lsr r2,r2,#8 1038 subs $len,$len,#1 1039 beq .Lsqueeze_done 1040 strb r2,[$out],#1 1041 subs $len,$len,#1 1042 beq .Lsqueeze_done 1043 1044 strb r0,[$out],#1 1045 lsr r0,r0,#8 1046 subs $len,$len,#1 1047 beq .Lsqueeze_done 1048 strb r0,[$out],#1 1049 lsr r0,r0,#8 1050 subs $len,$len,#1 1051 beq .Lsqueeze_done 1052 strb r0,[$out] 1053 b .Lsqueeze_done 1054 1055.align 4 1056.Lsqueeze_done: 1057 add sp,sp,#24 1058 ldmia sp!,{r4-r10,pc} 1059.size SHA3_squeeze,.-SHA3_squeeze 1060___ 1061} 1062 1063$code.=<<___; 1064#if __ARM_MAX_ARCH__>=7 1065.fpu neon 1066 1067.type iotas64, %object 1068.align 5 1069iotas64: 1070 .quad 0x0000000000000001 1071 .quad 0x0000000000008082 1072 .quad 0x800000000000808a 1073 .quad 0x8000000080008000 1074 .quad 0x000000000000808b 1075 .quad 0x0000000080000001 1076 .quad 0x8000000080008081 1077 .quad 0x8000000000008009 1078 .quad 0x000000000000008a 1079 .quad 0x0000000000000088 1080 .quad 0x0000000080008009 1081 .quad 0x000000008000000a 1082 .quad 0x000000008000808b 1083 .quad 0x800000000000008b 1084 .quad 0x8000000000008089 1085 .quad 0x8000000000008003 1086 .quad 0x8000000000008002 1087 .quad 0x8000000000000080 1088 .quad 0x000000000000800a 1089 .quad 0x800000008000000a 1090 .quad 0x8000000080008081 1091 .quad 0x8000000000008080 1092 .quad 0x0000000080000001 1093 .quad 0x8000000080008008 1094.size iotas64,.-iotas64 1095 1096.type KeccakF1600_neon, %function 1097.align 5 1098KeccakF1600_neon: 1099 add r1, r0, #16 1100 adr r2, iotas64 1101 mov r3, #24 @ 
	b	.Loop_neon

.align	4
@ One iteration of this loop is one full Keccak-f[1600] round
@ (Theta, Rho+Pi, Chi, Iota); r3 counts 24 rounds down.
.Loop_neon:
	@ Theta
	vst1.64	{q4},  [r0,:64]		@ offload A[0..1][4]
	veor	q13, q0, q5		@ A[0..1][0]^A[2..3][0]
	vst1.64	{d18}, [r1,:64]		@ offload A[2][4]
	veor	q14, q1, q6		@ A[0..1][1]^A[2..3][1]
	veor	q15, q2, q7		@ A[0..1][2]^A[2..3][2]
	veor	d26, d26, d27		@ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0]
	veor	d27, d28, d29		@ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1]
	veor	q14, q3, q8		@ A[0..1][3]^A[2..3][3]
	veor	q4,  q4, q9		@ A[0..1][4]^A[2..3][4]
	veor	d30, d30, d31		@ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2]
	veor	d31, d28, d29		@ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3]
	veor	d25, d8,  d9		@ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4]
	veor	q13, q13, q10		@ C[0..1]^=A[4][0..1]
	veor	q14, q15, q11		@ C[2..3]^=A[4][2..3]
	veor	d25, d25, d24		@ C[4]^=A[4][4]

	@ ROL64(x,1) is synthesized as (x+x) | (x>>63): vadd doubles,
	@ vsri shifts-and-inserts the carried-out top bit.
	vadd.u64	q4,  q13, q13	@ C[0..1]<<1
	vadd.u64	q15, q14, q14	@ C[2..3]<<1
	vadd.u64	d18, d25, d25	@ C[4]<<1
	vsri.u64	q4,  q13, #63	@ ROL64(C[0..1],1)
	vsri.u64	q15, q14, #63	@ ROL64(C[2..3],1)
	vsri.u64	d18, d25, #63	@ ROL64(C[4],1)
	veor	d25, d25, d9		@ D[0] = C[4] ^= ROL64(C[1],1)
	veor	q13, q13, q15		@ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
	veor	d28, d28, d18		@ D[3] = C[2] ^= ROL64(C[4],1)
	veor	d29, d29, d8		@ D[4] = C[3] ^= ROL64(C[0],1)

	veor	d0,  d0,  d25		@ A[0][0] ^= C[4]
	veor	d1,  d1,  d25		@ A[1][0] ^= C[4]
	veor	d10, d10, d25		@ A[2][0] ^= C[4]
	veor	d11, d11, d25		@ A[3][0] ^= C[4]
	veor	d20, d20, d25		@ A[4][0] ^= C[4]

	veor	d2,  d2,  d26		@ A[0][1] ^= D[1]
	veor	d3,  d3,  d26		@ A[1][1] ^= D[1]
	veor	d12, d12, d26		@ A[2][1] ^= D[1]
	veor	d13, d13, d26		@ A[3][1] ^= D[1]
	veor	d21, d21, d26		@ A[4][1] ^= D[1]
	vmov	d26, d27

	veor	d6,  d6,  d28		@ A[0][3] ^= C[2]
	veor	d7,  d7,  d28		@ A[1][3] ^= C[2]
	veor	d16, d16, d28		@ A[2][3] ^= C[2]
	veor	d17, d17, d28		@ A[3][3] ^= C[2]
	veor	d23, d23, d28		@ A[4][3] ^= C[2]
	vld1.64	{q4},  [r0,:64]		@ restore A[0..1][4]
	vmov	d28, d29

	vld1.64	{d18}, [r1,:64]		@ restore A[2][4]
	veor	q2,  q2, q13		@ A[0..1][2] ^= D[2]
	veor	q7,  q7, q13		@ A[2..3][2] ^= D[2]
	veor	d22, d22, d27		@ A[4][2] ^= D[2]

	veor	q4,  q4, q14		@ A[0..1][4] ^= C[3]
	veor	q9,  q9, q14		@ A[2..3][4] ^= C[3]
	veor	d24, d24, d29		@ A[4][4] ^= C[3]

	@ Rho + Pi
	@ Each ROL64 is a vshl/vsri pair; rotations by a multiple of 8
	@ bits are done with a vext.8 byte permute instead — the
	@ commented-out vshl lines below mark those eliminated shifts.
	vmov	d26, d2			@ C[1] = A[0][1]
	vshl.u64	d2,  d3,  #44
	vmov	d27, d4			@ C[2] = A[0][2]
	vshl.u64	d4,  d14, #43
	vmov	d28, d6			@ C[3] = A[0][3]
	vshl.u64	d6,  d17, #21
	vmov	d29, d8			@ C[4] = A[0][4]
	vshl.u64	d8,  d24, #14
	vsri.u64	d2,  d3,  #64-44	@ A[0][1] = ROL64(A[1][1], rhotates[1][1])
	vsri.u64	d4,  d14, #64-43	@ A[0][2] = ROL64(A[2][2], rhotates[2][2])
	vsri.u64	d6,  d17, #64-21	@ A[0][3] = ROL64(A[3][3], rhotates[3][3])
	vsri.u64	d8,  d24, #64-14	@ A[0][4] = ROL64(A[4][4], rhotates[4][4])

	vshl.u64	d3,  d9,  #20
	vshl.u64	d14, d16, #25
	vshl.u64	d17, d15, #15
	vshl.u64	d24, d21, #2
	vsri.u64	d3,  d9,  #64-20	@ A[1][1] = ROL64(A[1][4], rhotates[1][4])
	vsri.u64	d14, d16, #64-25	@ A[2][2] = ROL64(A[2][3], rhotates[2][3])
	vsri.u64	d17, d15, #64-15	@ A[3][3] = ROL64(A[3][2], rhotates[3][2])
	vsri.u64	d24, d21, #64-2		@ A[4][4] = ROL64(A[4][1], rhotates[4][1])

	vshl.u64	d9,  d22, #61
	@ vshl.u64	d16, d19, #8
	vshl.u64	d15, d12, #10
	vshl.u64	d21, d7,  #55
	vsri.u64	d9,  d22, #64-61	@ A[1][4] = ROL64(A[4][2], rhotates[4][2])
	vext.8	d16, d19, d19, #8-1		@ A[2][3] = ROL64(A[3][4], rhotates[3][4])
	vsri.u64	d15, d12, #64-10	@ A[3][2] = ROL64(A[2][1], rhotates[2][1])
	vsri.u64	d21, d7,  #64-55	@ A[4][1] = ROL64(A[1][3], rhotates[1][3])

	vshl.u64	d22, d18, #39
	@ vshl.u64	d19, d23, #56
	vshl.u64	d12, d5,  #6
	vshl.u64	d7,  d13, #45
	vsri.u64	d22, d18, #64-39	@ A[4][2] = ROL64(A[2][4], rhotates[2][4])
	vext.8	d19, d23, d23, #8-7		@ A[3][4] = ROL64(A[4][3], rhotates[4][3])
	vsri.u64	d12, d5,  #64-6		@ A[2][1] = ROL64(A[1][2], rhotates[1][2])
	vsri.u64	d7,  d13, #64-45	@ A[1][3] = ROL64(A[3][1], rhotates[3][1])

	vshl.u64	d18, d20, #18
	vshl.u64	d23, d11, #41
	vshl.u64	d5,  d10, #3
	vshl.u64	d13, d1,  #36
	vsri.u64	d18, d20, #64-18	@ A[2][4] = ROL64(A[4][0], rhotates[4][0])
	vsri.u64	d23, d11, #64-41	@ A[4][3] = ROL64(A[3][0], rhotates[3][0])
	vsri.u64	d5,  d10, #64-3		@ A[1][2] = ROL64(A[2][0], rhotates[2][0])
	vsri.u64	d13, d1,  #64-36	@ A[3][1] = ROL64(A[1][0], rhotates[1][0])

	vshl.u64	d1,  d28, #28
	vshl.u64	d10, d26, #1
	vshl.u64	d11, d29, #27
	vshl.u64	d20, d27, #62
	vsri.u64	d1,  d28, #64-28	@ A[1][0] = ROL64(C[3], rhotates[0][3])
	vsri.u64	d10, d26, #64-1		@ A[2][0] = ROL64(C[1], rhotates[0][1])
	vsri.u64	d11, d29, #64-27	@ A[3][0] = ROL64(C[4], rhotates[0][4])
	vsri.u64	d20, d27, #64-62	@ A[4][0] = ROL64(C[2], rhotates[0][2])

	@ Chi + Iota
	@ Chi works on row pairs: q-registers cover rows 0..1 and 2..3
	@ at once, row 4 is done on d-registers afterwards.
	vbic	q13, q2, q1
	vbic	q14, q3, q2
	vbic	q15, q4, q3
	veor	q13, q13, q0		@ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
	veor	q14, q14, q1		@ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
	veor	q2,  q2,  q15		@ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
	vst1.64	{q13}, [r0,:64]		@ offload A[0..1][0]
	vbic	q13, q0, q4
	vbic	q15, q1, q0
	vmov	q1,  q14		@ A[0..1][1]
	veor	q3,  q3,  q13		@ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
	veor	q4,  q4,  q15		@ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])

	vbic	q13, q7, q6
	vmov	q0,  q5			@ A[2..3][0]
	vbic	q14, q8, q7
	vmov	q15, q6			@ A[2..3][1]
	veor	q5,  q5,  q13		@ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
	vbic	q13, q9, q8
	veor	q6,  q6,  q14		@ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
	vbic	q14, q0, q9
	veor	q7,  q7,  q13		@ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
	vbic	q13, q15, q0
	veor	q8,  q8,  q14		@ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
	vmov	q14, q10		@ A[4][0..1]
	veor	q9,  q9,  q13		@ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])

	vld1.64	d25, [r2,:64]!		@ Iota[i++]
	vbic	d26, d22, d21
	vbic	d27, d23, d22
	vld1.64	{q0}, [r0,:64]		@ restore A[0..1][0]
	veor	d20, d20, d26		@ A[4][0] ^= (~A[4][1] & A[4][2])
	vbic	d26, d24, d23
	veor	d21, d21, d27		@ A[4][1] ^= (~A[4][2] & A[4][3])
	vbic	d27, d28, d24
	veor	d22, d22, d26		@ A[4][2] ^= (~A[4][3] & A[4][4])
	vbic	d26, d29, d28
	veor	d23, d23, d27		@ A[4][3] ^= (~A[4][4] & A[4][0])
	veor	d0,  d0,  d25		@ A[0][0] ^= Iota[i]
	veor	d24, d24, d26		@ A[4][4] ^= (~A[4][0] & A[4][1])

	subs	r3, r3, #1
	bne	.Loop_neon

	bx	lr
.size	KeccakF1600_neon,.-KeccakF1600_neon

@ SHA3_absorb_neon(A_flat=r0, inp=r1, len=r2, bsz=r3)
@ Absorbs as many whole bsz-byte blocks of inp as len allows, permuting
@ after each, and returns (in r0) the number of leftover bytes (< bsz).
.global	SHA3_absorb_neon
.type	SHA3_absorb_neon, %function
.align	5
SHA3_absorb_neon:
	stmdb	sp!, {r4-r6,lr}
	vstmdb	sp!, {d8-d15}		@ callee-saved NEON regs

	mov	r4, r1			@ inp
	mov	r5, r2			@ len
	mov	r6, r3			@ bsz

	@ Load the 25 lanes into the d0-d24 layout used by
	@ KeccakF1600_neon (rows 0/1 and 2/3 interleaved into q-regs).
	vld1.32	{d0}, [r0,:64]!		@ A[0][0]
	vld1.32	{d2}, [r0,:64]!		@ A[0][1]
	vld1.32	{d4}, [r0,:64]!		@ A[0][2]
	vld1.32	{d6}, [r0,:64]!		@ A[0][3]
	vld1.32	{d8}, [r0,:64]!		@ A[0][4]

	vld1.32	{d1}, [r0,:64]!		@ A[1][0]
	vld1.32	{d3}, [r0,:64]!		@ A[1][1]
	vld1.32	{d5}, [r0,:64]!		@ A[1][2]
	vld1.32	{d7}, [r0,:64]!		@ A[1][3]
	vld1.32	{d9}, [r0,:64]!		@ A[1][4]

	vld1.32	{d10}, [r0,:64]!	@ A[2][0]
	vld1.32	{d12}, [r0,:64]!	@ A[2][1]
	vld1.32	{d14}, [r0,:64]!	@ A[2][2]
	vld1.32	{d16}, [r0,:64]!	@ A[2][3]
	vld1.32	{d18}, [r0,:64]!	@ A[2][4]

	vld1.32	{d11}, [r0,:64]!	@ A[3][0]
	vld1.32	{d13}, [r0,:64]!	@ A[3][1]
	vld1.32	{d15}, [r0,:64]!	@ A[3][2]
	vld1.32	{d17}, [r0,:64]!	@ A[3][3]
	vld1.32	{d19}, [r0,:64]!	@ A[3][4]

	vld1.32	{d20-d23}, [r0,:64]!	@ A[4][0..3]
	vld1.32	{d24}, [r0,:64]		@ A[4][4]
	sub	r0, r0, #24*8		@ rewind
	b	.Loop_absorb_neon

.align	4
.Loop_absorb_neon:
	subs	r12, r5, r6		@ len - bsz
	blo	.Labsorbed_neon
	mov	r5, r12

	@ XOR one rate-block of input into the state, 8 bytes per lane.
	@ The cmp/blo/beq ladder falls through to .Lprocess_neon as soon
	@ as bsz lanes have been absorbed (bsz is a multiple of 8).
	vld1.8	{d31}, [r4]!		@ endian-neutral loads...
	cmp	r6, #8*2
	veor	d0, d0, d31		@ A[0][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d2, d2, d31		@ A[0][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*4
	veor	d4, d4, d31		@ A[0][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d6, d6, d31		@ A[0][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31},[r4]!
	cmp	r6, #8*6
	veor	d8, d8, d31		@ A[0][4] ^= *inp++
	blo	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	veor	d1, d1, d31		@ A[1][0] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*8
	veor	d3, d3, d31		@ A[1][1] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d5, d5, d31		@ A[1][2] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*10
	veor	d7, d7, d31		@ A[1][3] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d9, d9, d31		@ A[1][4] ^= *inp++
	beq	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	cmp	r6, #8*12
	veor	d10, d10, d31		@ A[2][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d12, d12, d31		@ A[2][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*14
	veor	d14, d14, d31		@ A[2][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d16, d16, d31		@ A[2][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*16
	veor	d18, d18, d31		@ A[2][4] ^= *inp++
	blo	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	veor	d11, d11, d31		@ A[3][0] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
1377 cmp r6, #8*18 1378 veor d13, d13, d31 @ A[3][1] ^= *inp++ 1379 blo .Lprocess_neon 1380 vld1.8 {d31}, [r4]! 1381 veor d15, d15, d31 @ A[3][2] ^= *inp++ 1382 beq .Lprocess_neon 1383 vld1.8 {d31}, [r4]! 1384 cmp r6, #8*20 1385 veor d17, d17, d31 @ A[3][3] ^= *inp++ 1386 blo .Lprocess_neon 1387 vld1.8 {d31}, [r4]! 1388 veor d19, d19, d31 @ A[3][4] ^= *inp++ 1389 beq .Lprocess_neon 1390 1391 vld1.8 {d31}, [r4]! 1392 cmp r6, #8*22 1393 veor d20, d20, d31 @ A[4][0] ^= *inp++ 1394 blo .Lprocess_neon 1395 vld1.8 {d31}, [r4]! 1396 veor d21, d21, d31 @ A[4][1] ^= *inp++ 1397 beq .Lprocess_neon 1398 vld1.8 {d31}, [r4]! 1399 cmp r6, #8*24 1400 veor d22, d22, d31 @ A[4][2] ^= *inp++ 1401 blo .Lprocess_neon 1402 vld1.8 {d31}, [r4]! 1403 veor d23, d23, d31 @ A[4][3] ^= *inp++ 1404 beq .Lprocess_neon 1405 vld1.8 {d31}, [r4]! 1406 veor d24, d24, d31 @ A[4][4] ^= *inp++ 1407 1408.Lprocess_neon: 1409 bl KeccakF1600_neon 1410 b .Loop_absorb_neon 1411 1412.align 4 1413.Labsorbed_neon: 1414 vst1.32 {d0}, [r0,:64]! @ A[0][0..4] 1415 vst1.32 {d2}, [r0,:64]! 1416 vst1.32 {d4}, [r0,:64]! 1417 vst1.32 {d6}, [r0,:64]! 1418 vst1.32 {d8}, [r0,:64]! 1419 1420 vst1.32 {d1}, [r0,:64]! @ A[1][0..4] 1421 vst1.32 {d3}, [r0,:64]! 1422 vst1.32 {d5}, [r0,:64]! 1423 vst1.32 {d7}, [r0,:64]! 1424 vst1.32 {d9}, [r0,:64]! 1425 1426 vst1.32 {d10}, [r0,:64]! @ A[2][0..4] 1427 vst1.32 {d12}, [r0,:64]! 1428 vst1.32 {d14}, [r0,:64]! 1429 vst1.32 {d16}, [r0,:64]! 1430 vst1.32 {d18}, [r0,:64]! 1431 1432 vst1.32 {d11}, [r0,:64]! @ A[3][0..4] 1433 vst1.32 {d13}, [r0,:64]! 1434 vst1.32 {d15}, [r0,:64]! 1435 vst1.32 {d17}, [r0,:64]! 1436 vst1.32 {d19}, [r0,:64]! 1437 1438 vst1.32 {d20-d23}, [r0,:64]! 
@ A[4][0..4] 1439 vst1.32 {d24}, [r0,:64] 1440 1441 mov r0, r5 @ return value 1442 vldmia sp!, {d8-d15} 1443 ldmia sp!, {r4-r6,pc} 1444.size SHA3_absorb_neon,.-SHA3_absorb_neon 1445 1446.global SHA3_squeeze_neon 1447.type SHA3_squeeze_neon, %function 1448.align 5 1449SHA3_squeeze_neon: 1450 stmdb sp!, {r4-r6,lr} 1451 1452 mov r4, r1 @ out 1453 mov r5, r2 @ len 1454 mov r6, r3 @ bsz 1455 mov r12, r0 @ A_flat 1456 mov r14, r3 @ bsz 1457 b .Loop_squeeze_neon 1458 1459.align 4 1460.Loop_squeeze_neon: 1461 cmp r5, #8 1462 blo .Lsqueeze_neon_tail 1463 vld1.32 {d0}, [r12]! 1464 vst1.8 {d0}, [r4]! @ endian-neutral store 1465 1466 subs r5, r5, #8 @ len -= 8 1467 beq .Lsqueeze_neon_done 1468 1469 subs r14, r14, #8 @ bsz -= 8 1470 bhi .Loop_squeeze_neon 1471 1472 vstmdb sp!, {d8-d15} 1473 1474 vld1.32 {d0}, [r0,:64]! @ A[0][0..4] 1475 vld1.32 {d2}, [r0,:64]! 1476 vld1.32 {d4}, [r0,:64]! 1477 vld1.32 {d6}, [r0,:64]! 1478 vld1.32 {d8}, [r0,:64]! 1479 1480 vld1.32 {d1}, [r0,:64]! @ A[1][0..4] 1481 vld1.32 {d3}, [r0,:64]! 1482 vld1.32 {d5}, [r0,:64]! 1483 vld1.32 {d7}, [r0,:64]! 1484 vld1.32 {d9}, [r0,:64]! 1485 1486 vld1.32 {d10}, [r0,:64]! @ A[2][0..4] 1487 vld1.32 {d12}, [r0,:64]! 1488 vld1.32 {d14}, [r0,:64]! 1489 vld1.32 {d16}, [r0,:64]! 1490 vld1.32 {d18}, [r0,:64]! 1491 1492 vld1.32 {d11}, [r0,:64]! @ A[3][0..4] 1493 vld1.32 {d13}, [r0,:64]! 1494 vld1.32 {d15}, [r0,:64]! 1495 vld1.32 {d17}, [r0,:64]! 1496 vld1.32 {d19}, [r0,:64]! 1497 1498 vld1.32 {d20-d23}, [r0,:64]! @ A[4][0..4] 1499 vld1.32 {d24}, [r0,:64] 1500 sub r0, r0, #24*8 @ rewind 1501 1502 bl KeccakF1600_neon 1503 1504 mov r12, r0 @ A_flat 1505 vst1.32 {d0}, [r0,:64]! @ A[0][0..4] 1506 vst1.32 {d2}, [r0,:64]! 1507 vst1.32 {d4}, [r0,:64]! 1508 vst1.32 {d6}, [r0,:64]! 1509 vst1.32 {d8}, [r0,:64]! 1510 1511 vst1.32 {d1}, [r0,:64]! @ A[1][0..4] 1512 vst1.32 {d3}, [r0,:64]! 1513 vst1.32 {d5}, [r0,:64]! 1514 vst1.32 {d7}, [r0,:64]! 1515 vst1.32 {d9}, [r0,:64]! 1516 1517 vst1.32 {d10}, [r0,:64]! 
@ A[2][0..4] 1518 vst1.32 {d12}, [r0,:64]! 1519 vst1.32 {d14}, [r0,:64]! 1520 vst1.32 {d16}, [r0,:64]! 1521 vst1.32 {d18}, [r0,:64]! 1522 1523 vst1.32 {d11}, [r0,:64]! @ A[3][0..4] 1524 vst1.32 {d13}, [r0,:64]! 1525 vst1.32 {d15}, [r0,:64]! 1526 vst1.32 {d17}, [r0,:64]! 1527 vst1.32 {d19}, [r0,:64]! 1528 1529 vst1.32 {d20-d23}, [r0,:64]! @ A[4][0..4] 1530 mov r14, r6 @ bsz 1531 vst1.32 {d24}, [r0,:64] 1532 mov r0, r12 @ rewind 1533 1534 vldmia sp!, {d8-d15} 1535 b .Loop_squeeze_neon 1536 1537.align 4 1538.Lsqueeze_neon_tail: 1539 ldmia r12, {r2,r3} 1540 cmp r5, #2 1541 strb r2, [r4],#1 @ endian-neutral store 1542 lsr r2, r2, #8 1543 blo .Lsqueeze_neon_done 1544 strb r2, [r4], #1 1545 lsr r2, r2, #8 1546 beq .Lsqueeze_neon_done 1547 strb r2, [r4], #1 1548 lsr r2, r2, #8 1549 cmp r5, #4 1550 blo .Lsqueeze_neon_done 1551 strb r2, [r4], #1 1552 beq .Lsqueeze_neon_done 1553 1554 strb r3, [r4], #1 1555 lsr r3, r3, #8 1556 cmp r5, #6 1557 blo .Lsqueeze_neon_done 1558 strb r3, [r4], #1 1559 lsr r3, r3, #8 1560 beq .Lsqueeze_neon_done 1561 strb r3, [r4], #1 1562 1563.Lsqueeze_neon_done: 1564 ldmia sp!, {r4-r6,pc} 1565.size SHA3_squeeze_neon,.-SHA3_squeeze_neon 1566#endif 1567.asciz "Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" 1568.align 2 1569___ 1570 1571{ 1572 my %ldr, %str; 1573 1574 sub ldrd { 1575 my ($mnemonic,$half,$reg,$ea) = @_; 1576 my $op = $mnemonic eq "ldr" ? \%ldr : \%str; 1577 1578 if ($half eq "l") { 1579 $$op{reg} = $reg; 1580 $$op{ea} = $ea; 1581 sprintf "#ifndef __thumb2__\n" . 1582 " %s\t%s,%s\n" . 1583 "#endif", $mnemonic,$reg,$ea; 1584 } else { 1585 sprintf "#ifndef __thumb2__\n" . 1586 " %s\t%s,%s\n" . 1587 "#else\n" . 1588 " %sd\t%s,%s,%s\n" . 
1589 "#endif", $mnemonic,$reg,$ea, 1590 $mnemonic,$$op{reg},$reg,$$op{ea}; 1591 } 1592 } 1593} 1594 1595foreach (split($/,$code)) { 1596 s/\`([^\`]*)\`/eval $1/ge; 1597 1598 s/^\s+(ldr|str)\.([lh])\s+(r[0-9]+),\s*(\[.*)/ldrd($1,$2,$3,$4)/ge or 1599 s/\b(ror|ls[rl])\s+(r[0-9]+.*)#/mov $2$1#/g or 1600 s/\bret\b/bx lr/g or 1601 s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4 1602 1603 print $_,"\n"; 1604} 1605 1606close STDOUT or die "error closing STDOUT: $!"; # enforce flush 1607