#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2014
#
# ChaCha20 for ARMv4.
#
# Performance in cycles per byte out of large buffer.
#
#			IALU/gcc-4.4	1xNEON	3xNEON+1xIALU
#
# Cortex-A5		19.3(*)/+95%	21.8	14.1
# Cortex-A8		10.5(*)/+160%	13.9	6.35
# Cortex-A9		12.9(**)/+110%	14.3	6.50
# Cortex-A15		11.0/+40%	16.0	5.00
# Snapdragon S4		11.5/+125%	13.6	4.90
#
# (*)	most "favourable" result for aligned data on a little-endian
#	processor; the result for misaligned data is 10-15% lower;
# (**)	this result is a trade-off: it can be improved by 20%, but
#	then the Snapdragon S4 and Cortex-A8 results get 20-25% worse;

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
my @t=map("r$_",(8..11));

sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my $odd = $d0&1;
my ($xc,$xc_) = (@t[0..1]);
my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]);
my @ret;

	# Consider the order in which the variables are addressed by
	# their index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a' and 'b' are permanently allocated in registers, @x[0..7],
	# while the 'c's and a pair of 'd's are maintained in memory. If
	# you observe the 'c' column, you'll notice that the pair of
	# 'c's is invariant between rounds. This means they have to be
	# reloaded only once per round, in the middle; that is why you'll
	# see a bunch of 'c' stores and loads in the middle, but none at
	# the beginning or end. If you observe the 'd' column, you'll
	# notice that 15 and 13 are reused in the next pair of rounds.
	# This is why these two are chosen for offloading to memory,
	# to make the loads count for more.
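	# For reference, one ChaCha20 quarter-round on 32-bit words is
	#
	#	a += b; d ^= a; d <<<= 16;
	#	c += d; b ^= c; b <<<= 12;
	#	a += b; d ^= a; d <<<= 8;
	#	c += d; b ^= c; b <<<= 7;
	#
	# The code below interleaves two such quarter-rounds at a time
	# and defers each rotation by folding it into the shifted second
	# operand of the instruction that next consumes the value, which
	# ARM provides for free. That is why the amounts appear as
	# right-rotates 16, 20, 24 and 25 rather than left-rotates
	# 16, 12, 8 and 7 (n <<< k == n >>> (32-k)).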
	push @ret,(
	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	"&mov	($xd,$xd,'ror#16')",
	"&add	(@x[$a1],@x[$a1],@x[$b1])",
	"&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a0],'ror#16')",
	"&eor	($xd_,$xd_,@x[$a1],'ror#16')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b0],@x[$b0],'ror#20')",
	"&add	($xc_,$xc_,$xd_)",
	"&mov	(@x[$b1],@x[$b1],'ror#20')",
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#20')",
	"&eor	(@x[$b1],@x[$b1],$xc_,'ror#20')",

	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	"&mov	($xd,$xd,'ror#24')",
	"&add	(@x[$a1],@x[$a1],@x[$b1])",
	"&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a0],'ror#24')",
	"&eor	($xd_,$xd_,@x[$a1],'ror#24')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b0],@x[$b0],'ror#25')" );
	push @ret,(
	"&str	($xd,'[sp,#4*(16+$d0)]')",
	"&ldr	($xd,'[sp,#4*(16+$d2)]')" )	if ($odd);
	push @ret,(
	"&add	($xc_,$xc_,$xd_)",
	"&mov	(@x[$b1],@x[$b1],'ror#25')" );
	push @ret,(
	"&str	($xd_,'[sp,#4*(16+$d1)]')",
	"&ldr	($xd_,'[sp,#4*(16+$d3)]')" )	if (!$odd);
	push @ret,(
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#25')",
	"&eor	(@x[$b1],@x[$b1],$xc_,'ror#25')" );

	$xd=@x[$d2]				if (!$odd);
	$xd_=@x[$d3]				if ($odd);
	push @ret,(
	"&str	($xc,'[sp,#4*(16+$c0)]')",
	"&ldr	($xc,'[sp,#4*(16+$c2)]')",
	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	"&mov	($xd,$xd,'ror#16')",
	"&str	($xc_,'[sp,#4*(16+$c1)]')",
	"&ldr	($xc_,'[sp,#4*(16+$c3)]')",
	"&add	(@x[$a3],@x[$a3],@x[$b3])",
	"&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a2],'ror#16')",
	"&eor	($xd_,$xd_,@x[$a3],'ror#16')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b2],@x[$b2],'ror#20')",
	"&add	($xc_,$xc_,$xd_)",
	"&mov	(@x[$b3],@x[$b3],'ror#20')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#20')",
	"&eor	(@x[$b3],@x[$b3],$xc_,'ror#20')",

	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	"&mov	($xd,$xd,'ror#24')",
	"&add	(@x[$a3],@x[$a3],@x[$b3])",
	"&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a2],'ror#24')",
	"&eor	($xd_,$xd_,@x[$a3],'ror#24')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b2],@x[$b2],'ror#25')",
	"&add	($xc_,$xc_,$xd_)",
	"&mov	(@x[$b3],@x[$b3],'ror#25')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#25')",
	"&eor	(@x[$b3],@x[$b3],$xc_,'ror#25')" );

	@ret;
}
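# A standalone illustration (not used in code generation) of how the
# ($_&~3)+(($_+1)&3) map in ROUND walks the table in its comment:
# starting from the even-round column (0,4,8,12) it yields (1,5,9,13),
# (2,6,10,14), (3,7,11,15), and from the odd-round diagonal
# (0,5,10,15) it yields (1,6,11,12), (2,7,8,13), (3,4,9,14):
#
#	for my $quad ([0,4,8,12],[0,5,10,15]) {
#		my @idx = @$quad;
#		for (1..3) {
#			@idx = map(($_&~3)+(($_+1)&3),@idx);
#			print join(",",@idx),"\n";
#		}
#	}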
$code.=<<___;
#include <openssl/arm_arch.h>

@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
.arch	armv7-a

.text
#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif

#if defined(__thumb2__) || defined(__clang__)
#define ldrhsb	ldrbhs
#endif

.align	5
.Lsigma:
.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
.Lone:
.long	1,0,0,0
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.LChaCha20_ctr32
#else
.word	-1
#endif

.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
.LChaCha20_ctr32:
	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb	sp!,{r0-r2,r4-r11,lr}
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r14,pc,#16		@ ChaCha20_ctr32
#else
	adr	r14,.LChaCha20_ctr32
#endif
	cmp	r2,#0			@ len==0?
#ifdef	__thumb2__
	itt	eq
#endif
	addeq	sp,sp,#4*3
	beq	.Lno_data
#if __ARM_MAX_ARCH__>=7
	cmp	r2,#192			@ test len
	bls	.Lshort
	ldr	r4,[r14,#-32]
	ldr	r4,[r14,r4]
# ifdef	__APPLE__
	ldr	r4,[r4]
# endif
	tst	r4,#ARMV7_NEON
	bne	.LChaCha20_neon
.Lshort:
#endif
	ldmia	r12,{r4-r7}		@ load counter and nonce
	sub	sp,sp,#4*(16)		@ off-load area
	sub	r14,r14,#64		@ .Lsigma
	stmdb	sp!,{r4-r7}		@ copy counter and nonce
	ldmia	r3,{r4-r11}		@ load key
	ldmia	r14,{r0-r3}		@ load sigma
	stmdb	sp!,{r4-r11}		@ copy key
	stmdb	sp!,{r0-r3}		@ copy sigma
	str	r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str	r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	b	.Loop_outer_enter

.align	4
.Loop_outer:
	ldmia	sp,{r0-r9}		@ load key material
	str	@t[3],[sp,#4*(32+2)]	@ save len
	str	r12, [sp,#4*(32+1)]	@ save inp
	str	r14, [sp,#4*(32+0)]	@ save out
.Loop_outer_enter:
	ldr	@t[3], [sp,#4*(15)]
	ldr	@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	ldr	@t[2], [sp,#4*(13)]
	ldr	@x[14],[sp,#4*(14)]
	str	@t[3], [sp,#4*(16+15)]
	mov	@t[3],#10
	b	.Loop

.align	4
.Loop:
	subs	@t[3],@t[3],#1
___
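# Each &ROUND below expands to four interleaved quarter-rounds:
# (0,4,8,12) covers the even-round columns and (0,5,10,15) the
# odd-round diagonals of the 4x4 state, so ten trips around .Loop
# give ChaCha20's twenty rounds.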
	foreach (&ROUND(0, 4, 8,12)) { eval; }
	foreach (&ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	bne	.Loop

	ldr	@t[3],[sp,#4*(32+2)]	@ load len

	str	@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str	@t[1], [sp,#4*(16+9)]
	str	@x[12],[sp,#4*(16+12)]
	str	@t[2], [sp,#4*(16+13)]
	str	@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	cmp	@t[3],#64		@ done yet?
#ifdef	__thumb2__
	itete	lo
#endif
	addlo	r12,sp,#4*(0)		@ shortcut or ...
	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
	addlo	r14,sp,#4*(0)		@ shortcut or ...
	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out

	ldr	@t[0],[sp,#4*(0)]	@ load key material
	ldr	@t[1],[sp,#4*(1)]

#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
# if __ARM_ARCH__<7
	orr	@t[2],r12,r14
	tst	@t[2],#3		@ are input and output aligned?
	ldr	@t[2],[sp,#4*(2)]
	bne	.Lunaligned
	cmp	@t[3],#64		@ restore flags
# else
	ldr	@t[2],[sp,#4*(2)]
# endif
	ldr	@t[3],[sp,#4*(3)]

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]

	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]	@ xor with input
	eorhs	@x[1],@x[1],@t[1]
	add	@t[0],sp,#4*(4)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[1],[r14,#-12]
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
	add	@t[0],sp,#4*(8)
	str	@x[4],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[5],[r14,#-12]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[6],[r14,#-8]
	add	@x[0],sp,#4*(16+8)
	str	@x[7],[r14,#-4]

	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
# ifdef	__thumb2__
	itt	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]
	eorhs	@x[1],@x[1],@t[1]
	add	@t[0],sp,#4*(12)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	str	@x[1],[r14,#-12]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hi
# endif
	addhi	@t[0],@t[0],#1		@ next counter value
	strhi	@t[0],[sp,#4*(12)]	@ save next counter value
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	it	ne
# endif
	ldrne	@t[0],[sp,#4*(32+2)]	@ re-load len
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[4],[r14],#16		@ store output
	str	@x[5],[r14,#-12]
# ifdef	__thumb2__
	it	hs
# endif
	subhs	@t[3],@t[0],#64		@ len-=64
	str	@x[6],[r14,#-8]
	str	@x[7],[r14,#-4]
	bhi	.Loop_outer

	beq	.Ldone
# if __ARM_ARCH__<7
	b	.Ltail

.align	4
.Lunaligned:				@ unaligned endian-neutral path
	cmp	@t[3],#64		@ restore flags
# endif
#endif
#if __ARM_ARCH__<7
	ldr	@t[3],[sp,#4*(3)]
___
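# The loop below emits the byte-at-a-time path for pre-ARMv7 cores,
# where word loads and stores must be 4-byte aligned: each iteration
# accumulates key material into four state words, fetches input one
# byte at a time with ldrb, and peels the key stream off a byte at a
# time via the 'lsr#8' operand (or XORs with zero when len<64, so the
# raw key stream lands in the stack buffer for .Ltail).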
for ($i=0;$i<16;$i+=4) {
my $j=$i&0x7;

$code.=<<___	if ($i==4);
	add	@x[0],sp,#4*(16+8)
___
$code.=<<___	if ($i==8);
	ldmia	@x[0],{@x[0]-@x[7]}		@ load second half
# ifdef	__thumb2__
	itt	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]		@ copy "@x[10]"
	strhi	@t[3],[sp,#4*(16+11)]		@ copy "@x[11]"
___
$code.=<<___;
	add	@x[$j+0],@x[$j+0],@t[0]		@ accumulate key material
___
$code.=<<___	if ($i==12);
# ifdef	__thumb2__
	itt	hi
# endif
	addhi	@t[0],@t[0],#1			@ next counter value
	strhi	@t[0],[sp,#4*(12)]		@ save next counter value
___
$code.=<<___;
	add	@x[$j+1],@x[$j+1],@t[1]
	add	@x[$j+2],@x[$j+2],@t[2]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[0],@t[0],@t[0]		@ zero or ...
	ldrhsb	@t[0],[r12],#16			@ ... load input
	eorlo	@t[1],@t[1],@t[1]
	ldrhsb	@t[1],[r12,#-12]

	add	@x[$j+3],@x[$j+3],@t[3]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[2],@t[2],@t[2]
	ldrhsb	@t[2],[r12,#-8]
	eorlo	@t[3],@t[3],@t[3]
	ldrhsb	@t[3],[r12,#-4]

	eor	@x[$j+0],@t[0],@x[$j+0]		@ xor with input (or zero)
	eor	@x[$j+1],@t[1],@x[$j+1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-15]		@ load more input
	ldrhsb	@t[1],[r12,#-11]
	eor	@x[$j+2],@t[2],@x[$j+2]
	strb	@x[$j+0],[r14],#16		@ store output
	eor	@x[$j+3],@t[3],@x[$j+3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-7]
	ldrhsb	@t[3],[r12,#-3]
	strb	@x[$j+1],[r14,#-12]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	strb	@x[$j+2],[r14,#-8]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-14]		@ load more input
	ldrhsb	@t[1],[r12,#-10]
	strb	@x[$j+3],[r14,#-4]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	strb	@x[$j+0],[r14,#-15]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-6]
	ldrhsb	@t[3],[r12,#-2]
	strb	@x[$j+1],[r14,#-11]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	strb	@x[$j+2],[r14,#-7]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-13]		@ load more input
	ldrhsb	@t[1],[r12,#-9]
	strb	@x[$j+3],[r14,#-3]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	strb	@x[$j+0],[r14,#-14]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-5]
	ldrhsb	@t[3],[r12,#-1]
	strb	@x[$j+1],[r14,#-10]
	strb	@x[$j+2],[r14,#-6]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	strb	@x[$j+3],[r14,#-2]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
	strb	@x[$j+0],[r14,#-13]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	strb	@x[$j+1],[r14,#-9]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
	strb	@x[$j+2],[r14,#-5]
	strb	@x[$j+3],[r14,#-1]
___
$code.=<<___	if ($i<12);
	add	@t[0],sp,#4*(4+$i)
	ldmia	@t[0],{@t[0]-@t[3]}		@ load key material
___
}
$code.=<<___;
# ifdef	__thumb2__
	it	ne
# endif
	ldrne	@t[0],[sp,#4*(32+2)]		@ re-load len
# ifdef	__thumb2__
	it	hs
# endif
	subhs	@t[3],@t[0],#64			@ len-=64
	bhi	.Loop_outer

	beq	.Ldone
#endif

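@ Tail: when fewer than 64 bytes remained, the code above stored the
@ raw key stream to the stack (both inp and out were pointed at sp),
@ so the partial block is finished by XORing that buffer with the
@ real input one byte at a time.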
.Ltail:
	ldr	r12,[sp,#4*(32+1)]	@ load inp
	add	@t[1],sp,#4*(0)
	ldr	r14,[sp,#4*(32+0)]	@ load out

.Loop_tail:
	ldrb	@t[2],[@t[1]],#1	@ read buffer on stack
	ldrb	@t[3],[r12],#1		@ read input
	subs	@t[0],@t[0],#1
	eor	@t[3],@t[3],@t[2]
	strb	@t[3],[r14],#1		@ store output
	bne	.Loop_tail

.Ldone:
	add	sp,sp,#4*(32+3)
.Lno_data:
	ldmia	sp!,{r4-r11,pc}
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

{{{
my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
    map("q$_",(0..15));

sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&vadd_i32	($a,$a,$b)",
	"&veor		($d,$d,$a)",
	"&vrev32_16	($d,$d)",	# vrot ($d,16)

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,20)",
	"&vsli_32	($b,$t,12)",

	"&vadd_i32	($a,$a,$b)",
	"&veor		($t,$d,$a)",
	"&vshr_u32	($d,$t,24)",
	"&vsli_32	($d,$t,8)",

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,25)",
	"&vsli_32	($b,$t,7)",

	"&vext_8	($c,$c,$c,8)",
	"&vext_8	($b,$b,$b,$odd?12:4)",
	"&vext_8	($d,$d,$d,$odd?4:12)"
	);
}
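# NEON has no vector rotate, so each left-rotation by n is synthesized
# above as vshr.u32 by 32-n into the scratch register $t followed by
# vsli.32 by n (shift left and insert); the rotation by 16 comes for
# free as a half-word swap, vrev32.16. The trailing vext.8 trio
# rotates the b, c and d rows by one, two and three lanes so that the
# next round's quarter-rounds line up column-wise again ($odd selects
# between diagonalization and its inverse).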
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb	sp!,{r0-r2,r4-r11,lr}
.LChaCha20_neon:
	adr	r14,.Lsigma
	vstmdb	sp!,{d8-d15}		@ ABI spec says so
	stmdb	sp!,{r0-r3}

	vld1.32	{$b0-$c0},[r3]		@ load key
	ldmia	r3,{r4-r11}		@ load key

	sub	sp,sp,#4*(16+16)
	vld1.32	{$d0},[r12]		@ load counter and nonce
	add	r12,sp,#4*8
	ldmia	r14,{r0-r3}		@ load sigma
	vld1.32	{$a0},[r14]!		@ load sigma
	vld1.32	{$t0},[r14]		@ one
	vst1.32	{$c0-$d0},[r12]		@ copy 1/2key|counter|nonce
	vst1.32	{$a0-$b0},[sp]		@ copy sigma|1/2key

	str	r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str	r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	vshl.i32	$t1#lo,$t0#lo,#1	@ two
	vstr	$t0#lo,[sp,#4*(16+0)]
	vshl.i32	$t2#lo,$t0#lo,#2	@ four
	vstr	$t1#lo,[sp,#4*(16+2)]
	vmov	$a1,$a0
	vstr	$t2#lo,[sp,#4*(16+4)]
	vmov	$a2,$a0
	vmov	$b1,$b0
	vmov	$b2,$b0
	b	.Loop_neon_enter

.align	4
.Loop_neon_outer:
	ldmia	sp,{r0-r9}		@ load key material
	cmp	@t[3],#64*2		@ if len<=64*2
	bls	.Lbreak_neon		@ switch to integer-only
	vmov	$a1,$a0
	str	@t[3],[sp,#4*(32+2)]	@ save len
	vmov	$a2,$a0
	str	r12, [sp,#4*(32+1)]	@ save inp
	vmov	$b1,$b0
	str	r14, [sp,#4*(32+0)]	@ save out
	vmov	$b2,$b0
.Loop_neon_enter:
	ldr	@t[3], [sp,#4*(15)]
	vadd.i32	$d1,$d0,$t0	@ counter+1
	ldr	@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	vmov	$c1,$c0
	ldr	@t[2], [sp,#4*(13)]
	vmov	$c2,$c0
	ldr	@x[14],[sp,#4*(14)]
	vadd.i32	$d2,$d1,$t0	@ counter+2
	str	@t[3], [sp,#4*(16+15)]
	mov	@t[3],#10
	add	@x[12],@x[12],#3	@ counter+3
	b	.Loop_neon

.align	4
.Loop_neon:
	subs	@t[3],@t[3],#1
___
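	# Four 64-byte blocks per iteration: three in NEON registers
	# ("threads" 0-2) and a fourth in the integer ALU via the
	# scalar ROUND from above (@thread3). The four instruction
	# streams are interleaved so that the NEON and integer
	# pipelines execute in parallel; this is the 3xNEON+1xIALU
	# figure in the performance table at the top of the file.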
	my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
	my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
	my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
	my @thread3=&ROUND(0,4,8,12);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}

	@thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
	@thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
	@thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
	@thread3=&ROUND(0,5,10,15);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}
$code.=<<___;
	bne	.Loop_neon

	add	@t[3],sp,#32
	vld1.32	{$t0-$t1},[sp]		@ load key material
	vld1.32	{$t2-$t3},[@t[3]]

	ldr	@t[3],[sp,#4*(32+2)]	@ load len

	str	@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str	@t[1], [sp,#4*(16+9)]
	str	@x[12],[sp,#4*(16+12)]
	str	@t[2], [sp,#4*(16+13)]
	str	@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	ldr	r12,[sp,#4*(32+1)]	@ load inp
	ldr	r14,[sp,#4*(32+0)]	@ load out

	vadd.i32	$a0,$a0,$t0	@ accumulate key material
	vadd.i32	$a1,$a1,$t0
	vadd.i32	$a2,$a2,$t0
	vldr	$t0#lo,[sp,#4*(16+0)]	@ one

	vadd.i32	$b0,$b0,$t1
	vadd.i32	$b1,$b1,$t1
	vadd.i32	$b2,$b2,$t1
	vldr	$t1#lo,[sp,#4*(16+2)]	@ two

	vadd.i32	$c0,$c0,$t2
	vadd.i32	$c1,$c1,$t2
	vadd.i32	$c2,$c2,$t2
	vadd.i32	$d1#lo,$d1#lo,$t0#lo	@ counter+1
	vadd.i32	$d2#lo,$d2#lo,$t1#lo	@ counter+2

	vadd.i32	$d0,$d0,$t3
	vadd.i32	$d1,$d1,$t3
	vadd.i32	$d2,$d2,$t3

	cmp	@t[3],#64*4
	blo	.Ltail_neon

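	@ At least 256 bytes remain, so all four blocks are emitted at
	@ full speed: three from NEON registers, the fourth from the
	@ scalar state, with loads for the next iteration interleaved
	@ throughout.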
	vld1.8	{$t0-$t1},[r12]!	@ load input
	mov	@t[3],sp
	vld1.8	{$t2-$t3},[r12]!
	veor	$a0,$a0,$t0		@ xor with input
	veor	$b0,$b0,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c0,$c0,$t2
	veor	$d0,$d0,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a1,$a1,$t0
	vst1.8	{$a0-$b0},[r14]!	@ store output
	veor	$b1,$b1,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c1,$c1,$t2
	vst1.8	{$c0-$d0},[r14]!
	veor	$d1,$d1,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a2,$a2,$t0
	vld1.32	{$a0-$b0},[@t[3]]!	@ load for next iteration
	veor	$t0#hi,$t0#hi,$t0#hi
	vldr	$t0#lo,[sp,#4*(16+4)]	@ four
	veor	$b2,$b2,$t1
	vld1.32	{$c0-$d0},[@t[3]]
	veor	$c2,$c2,$t2
	vst1.8	{$a1-$b1},[r14]!
	veor	$d2,$d2,$t3
	vst1.8	{$c1-$d1},[r14]!

	vadd.i32	$d0#lo,$d0#lo,$t0#lo	@ next counter value
	vldr	$t0#lo,[sp,#4*(16+0)]	@ one

	ldmia	sp,{@t[0]-@t[3]}	@ load key material
	add	@x[0],@x[0],@t[0]	@ accumulate key material
	ldr	@t[0],[r12],#16		@ load input
	vst1.8	{$a2-$b2},[r14]!
	add	@x[1],@x[1],@t[1]
	ldr	@t[1],[r12,#-12]
	vst1.8	{$c2-$d2},[r14]!
	add	@x[2],@x[2],@t[2]
	ldr	@t[2],[r12,#-8]
	add	@x[3],@x[3],@t[3]
	ldr	@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
	eor	@x[0],@x[0],@t[0]	@ xor with input
	add	@t[0],sp,#4*(4)
	eor	@x[1],@x[1],@t[1]
	str	@x[0],[r14],#16		@ store output
	eor	@x[2],@x[2],@t[2]
	str	@x[1],[r14,#-12]
	eor	@x[3],@x[3],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	ldr	@t[0],[r12],#16		@ load input
	add	@x[5],@x[5],@t[1]
	ldr	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	ldr	@t[2],[r12,#-8]
	add	@x[7],@x[7],@t[3]
	ldr	@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
	eor	@x[4],@x[4],@t[0]
	add	@t[0],sp,#4*(8)
	eor	@x[5],@x[5],@t[1]
	str	@x[4],[r14],#16		@ store output
	eor	@x[6],@x[6],@t[2]
	str	@x[5],[r14,#-12]
	eor	@x[7],@x[7],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[6],[r14,#-8]
	add	@x[0],sp,#4*(16+8)
	str	@x[7],[r14,#-4]

	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	ldr	@t[0],[r12],#16		@ load input
	add	@x[1],@x[1],@t[1]
	ldr	@t[1],[r12,#-12]
# ifdef	__thumb2__
	it	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	add	@x[2],@x[2],@t[2]
	ldr	@t[2],[r12,#-8]
# ifdef	__thumb2__
	it	hi
# endif
	strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add	@x[3],@x[3],@t[3]
	ldr	@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
	eor	@x[0],@x[0],@t[0]
	add	@t[0],sp,#4*(12)
	eor	@x[1],@x[1],@t[1]
	str	@x[0],[r14],#16		@ store output
	eor	@x[2],@x[2],@t[2]
	str	@x[1],[r14,#-12]
	eor	@x[3],@x[3],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@t[0],@t[0],#4		@ next counter value
	add	@x[5],@x[5],@t[1]
	str	@t[0],[sp,#4*(12)]	@ save next counter value
	ldr	@t[0],[r12],#16		@ load input
	add	@x[6],@x[6],@t[2]
	add	@x[4],@x[4],#3		@ counter+3
	ldr	@t[1],[r12,#-12]
	add	@x[7],@x[7],@t[3]
	ldr	@t[2],[r12,#-8]
	ldr	@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
	eor	@x[4],@x[4],@t[0]
# ifdef	__thumb2__
	it	hi
# endif
	ldrhi	@t[0],[sp,#4*(32+2)]	@ re-load len
	eor	@x[5],@x[5],@t[1]
	eor	@x[6],@x[6],@t[2]
	str	@x[4],[r14],#16		@ store output
	eor	@x[7],@x[7],@t[3]
	str	@x[5],[r14,#-12]
	sub	@t[3],@t[0],#64*4	@ len-=64*4
	str	@x[6],[r14,#-8]
	str	@x[7],[r14,#-4]
	bhi	.Loop_neon_outer

	b	.Ldone_neon

.align	4
.Lbreak_neon:
	@ harmonize NEON and integer-only stack frames: load data
	@ from NEON frame, but save to integer-only one; distance
	@ between the two is 4*(32+4+16-32)=4*(20).

	str	@t[3], [sp,#4*(20+32+2)]	@ save len
	add	@t[3],sp,#4*(32+4)
	str	r12,   [sp,#4*(20+32+1)]	@ save inp
	str	r14,   [sp,#4*(20+32+0)]	@ save out

	ldr	@x[12],[sp,#4*(16+10)]
	ldr	@x[14],[sp,#4*(16+11)]
	vldmia	@t[3],{d8-d15}			@ fulfill ABI requirement
	str	@x[12],[sp,#4*(20+16+10)]	@ copy "@x[10]"
	str	@x[14],[sp,#4*(20+16+11)]	@ copy "@x[11]"

	ldr	@t[3], [sp,#4*(15)]
	ldr	@x[12],[sp,#4*(12)]		@ modulo-scheduled load
	ldr	@t[2], [sp,#4*(13)]
	ldr	@x[14],[sp,#4*(14)]
	str	@t[3], [sp,#4*(20+16+15)]
	add	@t[3],sp,#4*(20)
	vst1.32	{$a0-$b0},[@t[3]]!		@ copy key
	add	sp,sp,#4*(20)			@ switch frame
	vst1.32	{$c0-$d0},[@t[3]]
	mov	@t[3],#10
	b	.Loop				@ go integer-only

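	@ NEON tail: fewer than 256 bytes remain. Whole 64-byte blocks
	@ are XORed and stored directly; whatever is left of a block is
	@ staged on the stack and finished byte by byte in
	@ .Loop_tail_neon.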
.align	4
.Ltail_neon:
	cmp	@t[3],#64*3
	bhs	.L192_or_more_neon
	cmp	@t[3],#64*2
	bhs	.L128_or_more_neon
	cmp	@t[3],#64*1
	bhs	.L64_or_more_neon

	add	@t[0],sp,#4*(8)
	vst1.8	{$a0-$b0},[sp]
	add	@t[2],sp,#4*(0)
	vst1.8	{$c0-$d0},[@t[0]]
	b	.Loop_tail_neon

.align	4
.L64_or_more_neon:
	vld1.8	{$t0-$t1},[r12]!
	vld1.8	{$t2-$t3},[r12]!
	veor	$a0,$a0,$t0
	veor	$b0,$b0,$t1
	veor	$c0,$c0,$t2
	veor	$d0,$d0,$t3
	vst1.8	{$a0-$b0},[r14]!
	vst1.8	{$c0-$d0},[r14]!

	beq	.Ldone_neon

	add	@t[0],sp,#4*(8)
	vst1.8	{$a1-$b1},[sp]
	add	@t[2],sp,#4*(0)
	vst1.8	{$c1-$d1},[@t[0]]
	sub	@t[3],@t[3],#64*1	@ len-=64*1
	b	.Loop_tail_neon

.align	4
.L128_or_more_neon:
	vld1.8	{$t0-$t1},[r12]!
	vld1.8	{$t2-$t3},[r12]!
	veor	$a0,$a0,$t0
	veor	$b0,$b0,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c0,$c0,$t2
	veor	$d0,$d0,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a1,$a1,$t0
	veor	$b1,$b1,$t1
	vst1.8	{$a0-$b0},[r14]!
	veor	$c1,$c1,$t2
	vst1.8	{$c0-$d0},[r14]!
	veor	$d1,$d1,$t3
	vst1.8	{$a1-$b1},[r14]!
	vst1.8	{$c1-$d1},[r14]!

	beq	.Ldone_neon

	add	@t[0],sp,#4*(8)
	vst1.8	{$a2-$b2},[sp]
	add	@t[2],sp,#4*(0)
	vst1.8	{$c2-$d2},[@t[0]]
	sub	@t[3],@t[3],#64*2	@ len-=64*2
	b	.Loop_tail_neon

.align	4
.L192_or_more_neon:
	vld1.8	{$t0-$t1},[r12]!
	vld1.8	{$t2-$t3},[r12]!
	veor	$a0,$a0,$t0
	veor	$b0,$b0,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c0,$c0,$t2
	veor	$d0,$d0,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a1,$a1,$t0
	veor	$b1,$b1,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c1,$c1,$t2
	vst1.8	{$a0-$b0},[r14]!
	veor	$d1,$d1,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a2,$a2,$t0
	vst1.8	{$c0-$d0},[r14]!
	veor	$b2,$b2,$t1
	vst1.8	{$a1-$b1},[r14]!
	veor	$c2,$c2,$t2
	vst1.8	{$c1-$d1},[r14]!
	veor	$d2,$d2,$t3
	vst1.8	{$a2-$b2},[r14]!
	vst1.8	{$c2-$d2},[r14]!

	beq	.Ldone_neon

	ldmia	sp,{@t[0]-@t[3]}	@ load key material
	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@t[0],sp,#4*(4)
	add	@x[1],@x[1],@t[1]
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@t[0],sp,#4*(8)
	add	@x[5],@x[5],@t[1]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
# ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
	stmia	sp,{@x[0]-@x[7]}
	add	@x[0],sp,#4*(16+8)

	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@t[0],sp,#4*(12)
	add	@x[1],@x[1],@t[1]
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@t[0],sp,#4*(8)
	add	@x[5],@x[5],@t[1]
	add	@x[4],@x[4],#3		@ counter+3
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
	ldr	@t[3],[sp,#4*(32+2)]	@ re-load len
# ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
	stmia	@t[0],{@x[0]-@x[7]}
	add	@t[2],sp,#4*(0)
	sub	@t[3],@t[3],#64*3	@ len-=64*3

.Loop_tail_neon:
	ldrb	@t[0],[@t[2]],#1	@ read buffer on stack
	ldrb	@t[1],[r12],#1		@ read input
	subs	@t[3],@t[3],#1
	eor	@t[0],@t[0],@t[1]
	strb	@t[0],[r14],#1		@ store output
	bne	.Loop_tail_neon

.Ldone_neon:
	add	sp,sp,#4*(32+4)
	vldmia	sp,{d8-d15}
	add	sp,sp,#4*(16+3)
	ldmia	sp!,{r4-r11,pc}
.size	ChaCha20_neon,.-ChaCha20_neon
.comm	OPENSSL_armcap_P,4,4
#endif
___
}}}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT;