#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2014
#
# ChaCha20 for ARMv4.
#
# Performance in cycles per byte out of large buffer.
#
#			IALU/gcc-4.4	1xNEON		3xNEON+1xIALU
#
# Cortex-A5		19.3(*)/+95%	21.8		14.1
# Cortex-A8		10.5(*)/+160%	13.9		6.35
# Cortex-A9		12.9(**)/+110%	14.3		6.50
# Cortex-A15		11.0/+40%	16.0		5.00
# Snapdragon S4		11.5/+125%	13.6		4.90
#
# (*)	most "favourable" result for aligned data on little-endian
#	processor, result for misaligned data is 10-15% lower;
# (**)	this result is a trade-off: it can be improved by 20%,
#	but then Snapdragon S4 and Cortex-A8 results get
#	20-25% worse;

# Command-line handling: first argument is either the perlasm "flavour"
# (passed on to arm-xlate.pl) or, if it looks like a filename, the
# output file.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Pipe generated code through the arm-xlate.pl translator, looked up
    # relative to this script's own directory.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

# Thunk [simplified] x86-style perlasm: any undefined &mnemonic(...) call
# is turned into a "\tmnemonic\targs" line appended to $code; underscores
# in the Perl-side name become dots (e.g. vadd_i32 -> vadd.i32), and a
# purely numeric last argument gets an immediate '#' prefix.
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

# Integer register allocation for the scalar path: @x maps ChaCha state
# words 0..15 to registers ("x" entries are state words kept on the
# stack rather than in a register), @t are scratch registers r8-r11.
my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
my @t=map("r$_",(8..11));

# One double-round (two quarter-round layers) of the scalar ChaCha20
# core, returned as a list of strings to be eval'ed (each becomes one
# instruction via AUTOLOAD).
sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my $odd = $d0&1;
my ($xc,$xc_) = (@t[0..1]);
my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]);
my @ret;

	# Consider order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' are permanently allocated in registers, @x[0..7],
	# while 'c's and pair of 'd's are maintained in memory. If
	# you observe 'c' column, you'll notice that pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see
	# bunch of 'c' stores and loads in the middle, but none in
	# the beginning or end. If you observe 'd' column, you'll
	# notice that 15 and 13 are reused in next pair of rounds.
	# This is why these two are chosen for offloading to memory,
	# to make loads count more.

	push @ret,(
	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	"&mov	($xd,$xd,'ror#16')",
	"&add	(@x[$a1],@x[$a1],@x[$b1])",
	"&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a0],'ror#16')",
	"&eor	($xd_,$xd_,@x[$a1],'ror#16')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b0],@x[$b0],'ror#20')",
	"&add	($xc_,$xc_,$xd_)",
	"&mov	(@x[$b1],@x[$b1],'ror#20')",
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#20')",
	"&eor	(@x[$b1],@x[$b1],$xc_,'ror#20')",

	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	"&mov	($xd,$xd,'ror#24')",
	"&add	(@x[$a1],@x[$a1],@x[$b1])",
	"&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a0],'ror#24')",
	"&eor	($xd_,$xd_,@x[$a1],'ror#24')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b0],@x[$b0],'ror#25')" );
	push @ret,(
	"&str	($xd,'[sp,#4*(16+$d0)]')",
	"&ldr	($xd,'[sp,#4*(16+$d2)]')" )	if ($odd);
	push @ret,(
	"&add	($xc_,$xc_,$xd_)",
	"&mov	(@x[$b1],@x[$b1],'ror#25')" );
	push @ret,(
	"&str	($xd_,'[sp,#4*(16+$d1)]')",
	"&ldr	($xd_,'[sp,#4*(16+$d3)]')" )	if (!$odd);
	push @ret,(
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#25')",
	"&eor	(@x[$b1],@x[$b1],$xc_,'ror#25')" );

	$xd=@x[$d2]		if (!$odd);
	$xd_=@x[$d3]		if ($odd);
	push @ret,(
	"&str	($xc,'[sp,#4*(16+$c0)]')",
	"&ldr	($xc,'[sp,#4*(16+$c2)]')",
	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	"&mov	($xd,$xd,'ror#16')",
	"&str	($xc_,'[sp,#4*(16+$c1)]')",
	"&ldr	($xc_,'[sp,#4*(16+$c3)]')",
	"&add	(@x[$a3],@x[$a3],@x[$b3])",
	"&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a2],'ror#16')",
	"&eor	($xd_,$xd_,@x[$a3],'ror#16')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b2],@x[$b2],'ror#20')",
	"&add	($xc_,$xc_,$xd_)",
	"&mov	(@x[$b3],@x[$b3],'ror#20')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#20')",
	"&eor	(@x[$b3],@x[$b3],$xc_,'ror#20')",

	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	"&mov	($xd,$xd,'ror#24')",
	"&add	(@x[$a3],@x[$a3],@x[$b3])",
	"&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a2],'ror#24')",
	"&eor	($xd_,$xd_,@x[$a3],'ror#24')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b2],@x[$b2],'ror#25')",
	"&add	($xc_,$xc_,$xd_)",
	"&mov	(@x[$b3],@x[$b3],'ror#25')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#25')",
	"&eor	(@x[$b3],@x[$b3],$xc_,'ror#25')" );

	@ret;
}

$code.=<<___;
#include <openssl/arm_arch.h>

.text
#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif

#if defined(__thumb2__) || defined(__clang__)
#define ldrhsb	ldrbhs
#endif

.align	5
.Lsigma:
.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
.Lone:
.long	1,0,0,0
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.LChaCha20_ctr32
#else
.word	-1
#endif

.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
.LChaCha20_ctr32:
	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb	sp!,{r0-r2,r4-r11,lr}
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r14,pc,#16		@ ChaCha20_ctr32
#else
	adr	r14,.LChaCha20_ctr32
#endif
	cmp	r2,#0			@ len==0?
#ifdef	__thumb2__
	itt	eq
#endif
	addeq	sp,sp,#4*3
	beq	.Lno_data
#if __ARM_MAX_ARCH__>=7
	cmp	r2,#192			@ test len
	bls	.Lshort
	ldr	r4,[r14,#-32]
	ldr	r4,[r14,r4]
# ifdef	__APPLE__
	ldr	r4,[r4]
# endif
	tst	r4,#ARMV7_NEON
	bne	.LChaCha20_neon
.Lshort:
#endif
	ldmia	r12,{r4-r7}		@ load counter and nonce
	sub	sp,sp,#4*(16)		@ off-load area
	sub	r14,r14,#64		@ .Lsigma
	stmdb	sp!,{r4-r7}		@ copy counter and nonce
	ldmia	r3,{r4-r11}		@ load key
	ldmia	r14,{r0-r3}		@ load sigma
	stmdb	sp!,{r4-r11}		@ copy key
	stmdb	sp!,{r0-r3}		@ copy sigma
	str	r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str	r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	b	.Loop_outer_enter

.align	4
.Loop_outer:
	ldmia	sp,{r0-r9}		@ load key material
	str	@t[3],[sp,#4*(32+2)]	@ save len
	str	r12, [sp,#4*(32+1)]	@ save inp
	str	r14, [sp,#4*(32+0)]	@ save out
.Loop_outer_enter:
	ldr	@t[3], [sp,#4*(15)]
	ldr	@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	ldr	@t[2], [sp,#4*(13)]
	ldr	@x[14],[sp,#4*(14)]
	str	@t[3], [sp,#4*(16+15)]
	mov	@t[3],#10
	b	.Loop

.align	4
.Loop:
	subs	@t[3],@t[3],#1
___
	foreach (&ROUND(0, 4, 8,12)) { eval; }
	foreach (&ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	bne	.Loop

	ldr	@t[3],[sp,#4*(32+2)]	@ load len

	str	@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str	@t[1], [sp,#4*(16+9)]
	str	@x[12],[sp,#4*(16+12)]
	str	@t[2], [sp,#4*(16+13)]
	str	@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	cmp	@t[3],#64		@ done yet?
#ifdef	__thumb2__
	itete	lo
#endif
	addlo	r12,sp,#4*(0)		@ shortcut or ...
	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
	addlo	r14,sp,#4*(0)		@ shortcut or ...
	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out

	ldr	@t[0],[sp,#4*(0)]	@ load key material
	ldr	@t[1],[sp,#4*(1)]

#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
# if __ARM_ARCH__<7
	orr	@t[2],r12,r14
	tst	@t[2],#3		@ are input and output aligned?
	ldr	@t[2],[sp,#4*(2)]
	bne	.Lunaligned
	cmp	@t[3],#64		@ restore flags
# else
	ldr	@t[2],[sp,#4*(2)]
# endif
	ldr	@t[3],[sp,#4*(3)]

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]

	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]	@ xor with input
	eorhs	@x[1],@x[1],@t[1]
	add	@t[0],sp,#4*(4)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[1],[r14,#-12]
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
	add	@t[0],sp,#4*(8)
	str	@x[4],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[5],[r14,#-12]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[6],[r14,#-8]
	add	@x[0],sp,#4*(16+8)
	str	@x[7],[r14,#-4]

	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
# ifdef	__thumb2__
	itt	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]
	eorhs	@x[1],@x[1],@t[1]
	add	@t[0],sp,#4*(12)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	str	@x[1],[r14,#-12]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hi
# endif
	addhi	@t[0],@t[0],#1		@ next counter value
	strhi	@t[0],[sp,#4*(12)]	@ save next counter value
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	it	ne
# endif
	ldrne	@t[0],[sp,#4*(32+2)]	@ re-load len
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[4],[r14],#16		@ store output
	str	@x[5],[r14,#-12]
# ifdef	__thumb2__
	it	hs
# endif
	subhs	@t[3],@t[0],#64		@ len-=64
	str	@x[6],[r14,#-8]
	str	@x[7],[r14,#-4]
	bhi	.Loop_outer

	beq	.Ldone
# if __ARM_ARCH__<7
	b	.Ltail

.align	4
.Lunaligned:				@ unaligned endian-neutral path
	cmp	@t[3],#64		@ restore flags
# endif
#endif
#if __ARM_ARCH__<7
	ldr	@t[3],[sp,#4*(3)]
___
# Unaligned/endian-neutral path: process the 16 state words four at a
# time ($i = 0,4,8,12), byte by byte.
for ($i=0;$i<16;$i+=4) {
my $j=$i&0x7;

$code.=<<___	if ($i==4);
	add	@x[0],sp,#4*(16+8)
___
$code.=<<___	if ($i==8);
	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half
# ifdef	__thumb2__
	itt	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]"
	strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]"
___
$code.=<<___;
	add	@x[$j+0],@x[$j+0],@t[0]	@ accumulate key material
___
$code.=<<___	if ($i==12);
# ifdef	__thumb2__
	itt	hi
# endif
	addhi	@t[0],@t[0],#1		@ next counter value
	strhi	@t[0],[sp,#4*(12)]	@ save next counter value
___
$code.=<<___;
	add	@x[$j+1],@x[$j+1],@t[1]
	add	@x[$j+2],@x[$j+2],@t[2]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[0],@t[0],@t[0]	@ zero or ...
	ldrhsb	@t[0],[r12],#16		@ ... load input
	eorlo	@t[1],@t[1],@t[1]
	ldrhsb	@t[1],[r12,#-12]

	add	@x[$j+3],@x[$j+3],@t[3]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[2],@t[2],@t[2]
	ldrhsb	@t[2],[r12,#-8]
	eorlo	@t[3],@t[3],@t[3]
	ldrhsb	@t[3],[r12,#-4]

	eor	@x[$j+0],@t[0],@x[$j+0]	@ xor with input (or zero)
	eor	@x[$j+1],@t[1],@x[$j+1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-15]	@ load more input
	ldrhsb	@t[1],[r12,#-11]
	eor	@x[$j+2],@t[2],@x[$j+2]
	strb	@x[$j+0],[r14],#16	@ store output
	eor	@x[$j+3],@t[3],@x[$j+3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-7]
	ldrhsb	@t[3],[r12,#-3]
	strb	@x[$j+1],[r14,#-12]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	strb	@x[$j+2],[r14,#-8]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-14]	@ load more input
	ldrhsb	@t[1],[r12,#-10]
	strb	@x[$j+3],[r14,#-4]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	strb	@x[$j+0],[r14,#-15]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-6]
	ldrhsb	@t[3],[r12,#-2]
	strb	@x[$j+1],[r14,#-11]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	strb	@x[$j+2],[r14,#-7]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-13]	@ load more input
	ldrhsb	@t[1],[r12,#-9]
	strb	@x[$j+3],[r14,#-3]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	strb	@x[$j+0],[r14,#-14]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-5]
	ldrhsb	@t[3],[r12,#-1]
	strb	@x[$j+1],[r14,#-10]
	strb	@x[$j+2],[r14,#-6]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	strb	@x[$j+3],[r14,#-2]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
	strb	@x[$j+0],[r14,#-13]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	strb	@x[$j+1],[r14,#-9]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
	strb	@x[$j+2],[r14,#-5]
	strb	@x[$j+3],[r14,#-1]
___
$code.=<<___	if ($i<12);
	add	@t[0],sp,#4*(4+$i)
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
___
}
$code.=<<___;
# ifdef	__thumb2__
	it	ne
# endif
	ldrne	@t[0],[sp,#4*(32+2)]	@ re-load len
# ifdef	__thumb2__
	it	hs
# endif
	subhs	@t[3],@t[0],#64		@ len-=64
	bhi	.Loop_outer

	beq	.Ldone
#endif

.Ltail:
	ldr	r12,[sp,#4*(32+1)]	@ load inp
	add	@t[1],sp,#4*(0)
	ldr	r14,[sp,#4*(32+0)]	@ load out

.Loop_tail:
	ldrb	@t[2],[@t[1]],#1	@ read buffer on stack
	ldrb	@t[3],[r12],#1		@ read input
	subs	@t[0],@t[0],#1
	eor	@t[3],@t[3],@t[2]
	strb	@t[3],[r14],#1		@ store output
	bne	.Loop_tail

.Ldone:
	add	sp,sp,#4*(32+3)
.Lno_data:
	ldmia	sp!,{r4-r11,pc}
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

{{{
# NEON path: three 128-bit "threads" in q registers interleaved with one
# scalar thread, i.e. 4x64 bytes per outer iteration.
my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
    map("q$_",(0..15));

# One quarter-round layer of the NEON ChaCha20 core; $odd selects the
# lane rotation for odd (diagonal) vs even (column) rounds.
sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&vadd_i32	($a,$a,$b)",
	"&veor		($d,$d,$a)",
	"&vrev32_16	($d,$d)",	# vrot ($d,16)

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,20)",
	"&vsli_32	($b,$t,12)",

	"&vadd_i32	($a,$a,$b)",
	"&veor		($t,$d,$a)",
	"&vshr_u32	($d,$t,24)",
	"&vsli_32	($d,$t,8)",

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,25)",
	"&vsli_32	($b,$t,7)",

	"&vext_8	($c,$c,$c,8)",
	"&vext_8	($b,$b,$b,$odd?12:4)",
	"&vext_8	($d,$d,$d,$odd?4:12)"
	);
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	ldr		r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb		sp!,{r0-r2,r4-r11,lr}
.LChaCha20_neon:
	adr		r14,.Lsigma
	vstmdb		sp!,{d8-d15}		@ ABI spec says so
	stmdb		sp!,{r0-r3}

	vld1.32		{$b0-$c0},[r3]		@ load key
	ldmia		r3,{r4-r11}		@ load key

	sub		sp,sp,#4*(16+16)
	vld1.32		{$d0},[r12]		@ load counter and nonce
	add		r12,sp,#4*8
	ldmia		r14,{r0-r3}		@ load sigma
	vld1.32		{$a0},[r14]!		@ load sigma
	vld1.32		{$t0},[r14]		@ one
	vst1.32		{$c0-$d0},[r12]		@ copy 1/2key|counter|nonce
	vst1.32		{$a0-$b0},[sp]		@ copy sigma|1/2key

	str		r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str		r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	vshl.i32	$t1#lo,$t0#lo,#1	@ two
	vstr		$t0#lo,[sp,#4*(16+0)]
	vshl.i32	$t2#lo,$t0#lo,#2	@ four
	vstr		$t1#lo,[sp,#4*(16+2)]
	vmov		$a1,$a0
	vstr		$t2#lo,[sp,#4*(16+4)]
	vmov		$a2,$a0
	vmov		$b1,$b0
	vmov		$b2,$b0
	b		.Loop_neon_enter

.align	4
.Loop_neon_outer:
	ldmia		sp,{r0-r9}		@ load key material
	cmp		@t[3],#64*2		@ if len<=64*2
	bls		.Lbreak_neon		@ switch to integer-only
	vmov		$a1,$a0
	str		@t[3],[sp,#4*(32+2)]	@ save len
	vmov		$a2,$a0
	str		r12, [sp,#4*(32+1)]	@ save inp
	vmov		$b1,$b0
	str		r14, [sp,#4*(32+0)]	@ save out
	vmov		$b2,$b0
.Loop_neon_enter:
	ldr		@t[3], [sp,#4*(15)]
	vadd.i32	$d1,$d0,$t0		@ counter+1
	ldr		@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	vmov		$c1,$c0
	ldr		@t[2], [sp,#4*(13)]
	vmov		$c2,$c0
	ldr		@x[14],[sp,#4*(14)]
	vadd.i32	$d2,$d1,$t0		@ counter+2
	str		@t[3], [sp,#4*(16+15)]
	mov		@t[3],#10
	add		@x[12],@x[12],#3	@ counter+3
	b		.Loop_neon

.align	4
.Loop_neon:
	subs		@t[3],@t[3],#1
___
	my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
	my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
	my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
	my @thread3=&ROUND(0,4,8,12);

	# interleave three NEON threads with the scalar thread
	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}

	@thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
	@thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
	@thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
	@thread3=&ROUND(0,5,10,15);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}
$code.=<<___;
	bne		.Loop_neon

	add		@t[3],sp,#32
	vld1.32		{$t0-$t1},[sp]		@ load key material
	vld1.32		{$t2-$t3},[@t[3]]

	ldr		@t[3],[sp,#4*(32+2)]	@ load len

	str		@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str		@t[1], [sp,#4*(16+9)]
	str		@x[12],[sp,#4*(16+12)]
	str		@t[2], [sp,#4*(16+13)]
	str		@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	ldr		r12,[sp,#4*(32+1)]	@ load inp
	ldr		r14,[sp,#4*(32+0)]	@ load out

	vadd.i32	$a0,$a0,$t0		@ accumulate key material
	vadd.i32	$a1,$a1,$t0
	vadd.i32	$a2,$a2,$t0
	vldr		$t0#lo,[sp,#4*(16+0)]	@ one

	vadd.i32	$b0,$b0,$t1
	vadd.i32	$b1,$b1,$t1
	vadd.i32	$b2,$b2,$t1
	vldr		$t1#lo,[sp,#4*(16+2)]	@ two

	vadd.i32	$c0,$c0,$t2
	vadd.i32	$c1,$c1,$t2
	vadd.i32	$c2,$c2,$t2
	vadd.i32	$d1#lo,$d1#lo,$t0#lo	@ counter+1
	vadd.i32	$d2#lo,$d2#lo,$t1#lo	@ counter+2

	vadd.i32	$d0,$d0,$t3
	vadd.i32	$d1,$d1,$t3
	vadd.i32	$d2,$d2,$t3

	cmp		@t[3],#64*4
	blo		.Ltail_neon

	vld1.8		{$t0-$t1},[r12]!	@ load input
	mov		@t[3],sp
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0		@ xor with input
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	vst1.8		{$a0-$b0},[r14]!	@ store output
	veor		$b1,$b1,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c1,$c1,$t2
	vst1.8		{$c0-$d0},[r14]!
	veor		$d1,$d1,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a2,$a2,$t0
	vld1.32		{$a0-$b0},[@t[3]]!	@ load for next iteration
	veor		$t0#hi,$t0#hi,$t0#hi
	vldr		$t0#lo,[sp,#4*(16+4)]	@ four
	veor		$b2,$b2,$t1
	vld1.32		{$c0-$d0},[@t[3]]
	veor		$c2,$c2,$t2
	vst1.8		{$a1-$b1},[r14]!
	veor		$d2,$d2,$t3
	vst1.8		{$c1-$d1},[r14]!

	vadd.i32	$d0#lo,$d0#lo,$t0#lo	@ next counter value
	vldr		$t0#lo,[sp,#4*(16+0)]	@ one

	ldmia		sp,{@t[0]-@t[3]}	@ load key material
	add		@x[0],@x[0],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	vst1.8		{$a2-$b2},[r14]!
	add		@x[1],@x[1],@t[1]
	ldr		@t[1],[r12,#-12]
	vst1.8		{$c2-$d2},[r14]!
	add		@x[2],@x[2],@t[2]
	ldr		@t[2],[r12,#-8]
	add		@x[3],@x[3],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
# endif
	eor		@x[0],@x[0],@t[0]	@ xor with input
	add		@t[0],sp,#4*(4)
	eor		@x[1],@x[1],@t[1]
	str		@x[0],[r14],#16		@ store output
	eor		@x[2],@x[2],@t[2]
	str		@x[1],[r14,#-12]
	eor		@x[3],@x[3],@t[3]
	ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[2],[r14,#-8]
	str		@x[3],[r14,#-4]

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	add		@x[5],@x[5],@t[1]
	ldr		@t[1],[r12,#-12]
	add		@x[6],@x[6],@t[2]
	ldr		@t[2],[r12,#-8]
	add		@x[7],@x[7],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	eor		@x[4],@x[4],@t[0]
	add		@t[0],sp,#4*(8)
	eor		@x[5],@x[5],@t[1]
	str		@x[4],[r14],#16		@ store output
	eor		@x[6],@x[6],@t[2]
	str		@x[5],[r14,#-12]
	eor		@x[7],@x[7],@t[3]
	ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[6],[r14,#-8]
	add		@x[0],sp,#4*(16+8)
	str		@x[7],[r14,#-4]

	ldmia		@x[0],{@x[0]-@x[7]}	@ load second half

	add		@x[0],@x[0],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	add		@x[1],@x[1],@t[1]
	ldr		@t[1],[r12,#-12]
# ifdef	__thumb2__
	it	hi
# endif
	strhi		@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	add		@x[2],@x[2],@t[2]
	ldr		@t[2],[r12,#-8]
# ifdef	__thumb2__
	it	hi
# endif
	strhi		@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add		@x[3],@x[3],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
# endif
	eor		@x[0],@x[0],@t[0]
	add		@t[0],sp,#4*(12)
	eor		@x[1],@x[1],@t[1]
	str		@x[0],[r14],#16		@ store output
	eor		@x[2],@x[2],@t[2]
	str		@x[1],[r14,#-12]
	eor		@x[3],@x[3],@t[3]
	ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[2],[r14,#-8]
	str		@x[3],[r14,#-4]

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	add		@t[0],@t[0],#4		@ next counter value
	add		@x[5],@x[5],@t[1]
	str		@t[0],[sp,#4*(12)]	@ save next counter value
	ldr		@t[0],[r12],#16		@ load input
	add		@x[6],@x[6],@t[2]
	add		@x[4],@x[4],#3		@ counter+3
	ldr		@t[1],[r12,#-12]
	add		@x[7],@x[7],@t[3]
	ldr		@t[2],[r12,#-8]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	eor		@x[4],@x[4],@t[0]
# ifdef	__thumb2__
	it	hi
# endif
	ldrhi		@t[0],[sp,#4*(32+2)]	@ re-load len
	eor		@x[5],@x[5],@t[1]
	eor		@x[6],@x[6],@t[2]
	str		@x[4],[r14],#16		@ store output
	eor		@x[7],@x[7],@t[3]
	str		@x[5],[r14,#-12]
	sub		@t[3],@t[0],#64*4	@ len-=64*4
	str		@x[6],[r14,#-8]
	str		@x[7],[r14,#-4]
	bhi		.Loop_neon_outer

	b		.Ldone_neon

.align	4
.Lbreak_neon:
	@ harmonize NEON and integer-only stack frames: load data
	@ from NEON frame, but save to integer-only one; distance
	@ between the two is 4*(32+4+16-32)=4*(20).

	str		@t[3], [sp,#4*(20+32+2)]	@ save len
	add		@t[3],sp,#4*(32+4)
	str		r12,   [sp,#4*(20+32+1)]	@ save inp
	str		r14,   [sp,#4*(20+32+0)]	@ save out

	ldr		@x[12],[sp,#4*(16+10)]
	ldr		@x[14],[sp,#4*(16+11)]
	vldmia		@t[3],{d8-d15}			@ fulfill ABI requirement
	str		@x[12],[sp,#4*(20+16+10)]	@ copy "@x[10]"
	str		@x[14],[sp,#4*(20+16+11)]	@ copy "@x[11]"

	ldr		@t[3], [sp,#4*(15)]
	ldr		@x[12],[sp,#4*(12)]		@ modulo-scheduled load
	ldr		@t[2], [sp,#4*(13)]
	ldr		@x[14],[sp,#4*(14)]
	str		@t[3], [sp,#4*(20+16+15)]
	add		@t[3],sp,#4*(20)
	vst1.32		{$a0-$b0},[@t[3]]!		@ copy key
	add		sp,sp,#4*(20)			@ switch frame
	vst1.32		{$c0-$d0},[@t[3]]
	mov		@t[3],#10
	b		.Loop				@ go integer-only

.align	4
.Ltail_neon:
	cmp		@t[3],#64*3
	bhs		.L192_or_more_neon
	cmp		@t[3],#64*2
	bhs		.L128_or_more_neon
	cmp		@t[3],#64*1
	bhs		.L64_or_more_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a0-$b0},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c0-$d0},[@t[0]]
	b		.Loop_tail_neon

.align	4
.L64_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vst1.8		{$a0-$b0},[r14]!
	vst1.8		{$c0-$d0},[r14]!

	beq		.Ldone_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a1-$b1},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c1-$d1},[@t[0]]
	sub		@t[3],@t[3],#64*1	@ len-=64*1
	b		.Loop_tail_neon

.align	4
.L128_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	veor		$b1,$b1,$t1
	vst1.8		{$a0-$b0},[r14]!
	veor		$c1,$c1,$t2
	vst1.8		{$c0-$d0},[r14]!
	veor		$d1,$d1,$t3
	vst1.8		{$a1-$b1},[r14]!
	vst1.8		{$c1-$d1},[r14]!

	beq		.Ldone_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a2-$b2},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c2-$d2},[@t[0]]
	sub		@t[3],@t[3],#64*2	@ len-=64*2
	b		.Loop_tail_neon

.align	4
.L192_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	veor		$b1,$b1,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c1,$c1,$t2
	vst1.8		{$a0-$b0},[r14]!
	veor		$d1,$d1,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a2,$a2,$t0
	vst1.8		{$c0-$d0},[r14]!
	veor		$b2,$b2,$t1
	vst1.8		{$a1-$b1},[r14]!
	veor		$c2,$c2,$t2
	vst1.8		{$c1-$d1},[r14]!
	veor		$d2,$d2,$t3
	vst1.8		{$a2-$b2},[r14]!
	vst1.8		{$c2-$d2},[r14]!

	beq		.Ldone_neon

	ldmia		sp,{@t[0]-@t[3]}	@ load key material
	add		@x[0],@x[0],@t[0]	@ accumulate key material
	add		@t[0],sp,#4*(4)
	add		@x[1],@x[1],@t[1]
	add		@x[2],@x[2],@t[2]
	add		@x[3],@x[3],@t[3]
	ldmia		@t[0],{@t[0]-@t[3]}	@ load key material

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	add		@t[0],sp,#4*(8)
	add		@x[5],@x[5],@t[1]
	add		@x[6],@x[6],@t[2]
	add		@x[7],@x[7],@t[3]
	ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	stmia		sp,{@x[0]-@x[7]}
	add		@x[0],sp,#4*(16+8)

	ldmia		@x[0],{@x[0]-@x[7]}	@ load second half

	add		@x[0],@x[0],@t[0]	@ accumulate key material
	add		@t[0],sp,#4*(12)
	add		@x[1],@x[1],@t[1]
	add		@x[2],@x[2],@t[2]
	add		@x[3],@x[3],@t[3]
	ldmia		@t[0],{@t[0]-@t[3]}	@ load key material

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	add		@t[0],sp,#4*(8)
	add		@x[5],@x[5],@t[1]
	add		@x[4],@x[4],#3		@ counter+3
	add		@x[6],@x[6],@t[2]
	add		@x[7],@x[7],@t[3]
	ldr		@t[3],[sp,#4*(32+2)]	@ re-load len
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	stmia		@t[0],{@x[0]-@x[7]}
	add		@t[2],sp,#4*(0)
	sub		@t[3],@t[3],#64*3	@ len-=64*3

.Loop_tail_neon:
	ldrb		@t[0],[@t[2]],#1	@ read buffer on stack
	ldrb		@t[1],[r12],#1		@ read input
	subs		@t[3],@t[3],#1
	eor		@t[0],@t[0],@t[1]
	strb		@t[0],[r14],#1		@ store output
	bne		.Loop_tail_neon

.Ldone_neon:
	add		sp,sp,#4*(32+4)
	vldmia		sp,{d8-d15}
	add		sp,sp,#4*(16+3)
	ldmia		sp!,{r4-r11,pc}
.size	ChaCha20_neon,.-ChaCha20_neon
.comm	OPENSSL_armcap_P,4,4
#endif
___
}}}

# Final pass over the accumulated code: evaluate `...` constructs and
# translate qN#lo/qN#hi pseudo-notation into the matching dN register.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT;