#! /usr/bin/env perl
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in sense that it supports both big- and
# little-endian cases. As does it support both 32- and 64-bit modes
# of operation. Latter is achieved by limiting amount of utilized
# registers to 16, which implies additional NEON load and integer
# instructions. This has no effect on mighty Apple A7, where results
# are literally equal to the theoretical estimates based on AES
# instruction latencies and issue rates. On Cortex-A53, an in-order
# execution core, this costs up to 10-15%, which is partially
# compensated by implementing dedicated code path for 128-bit
# CBC encrypt case. On Cortex-A57 parallelizable mode performance
# seems to be limited by sheer amount of NEON instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.29		1.46
# Cortex-A57(*)	1.95		0.85		0.93
# Denver	1.96		0.86		0.80
# Mongoose	1.33		1.20		1.20
# Kryo		1.26		0.94		1.00
#
# (*)	original 3.64/1.34/1.32 results were for r0p0 revision
#	and are still same even for updated module;

# Command line: <flavour> <output>.  $flavour is tested against /64/
# throughout to choose between the 32- and 64-bit variants; $output is
# handed through to arm-xlate.pl unchanged.
$flavour = shift;
$output  = shift;

# Locate arm-xlate.pl next to this script or in ../../perlasm/.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on is piped through arm-xlate.pl,
# run with the same perl binary ($^X).  Fail loudly if the pipe cannot be
# started instead of silently generating nothing.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Prefix of every global symbol emitted below.
$prefix="aes_v8";

$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch armv8-a+crypto\n"			if ($flavour =~ /64/);
$code.=<<___					if ($flavour !~ /64/);
.arch	armv7-a	// don't confuse not-so-latest binutils with argv8 :-)
.fpu	neon
.code	32
#undef __thumb2__
___

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
# maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex voodoo.
#
# Key setup: emits ${prefix}_set_encrypt_key and ${prefix}_set_decrypt_key.
# The latter calls into the former through the local label .Lenc_key.
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,.Lrcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
#ifdef __ARMEB__
	vst1.32	{$in1},[$out],#16
	sub	$out,$out,#8
#else
	vst1.32	{$in1},[$out],#8
#endif
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
# Emit the single-block routine ${prefix}_${dir}crypt.  $dir is "en" or
# "de" and selects the aese/aesmc or aesd/aesimc instruction pair.
sub gen_block {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
gen_block("en");
gen_block("de");
}}}
# CBC mode: emits ${prefix}_cbc_encrypt, which branches to .Lcbc_dec when
# the $enc argument compares equal to zero.
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	vld1.32	{q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	veor	$tmp0,$ivec,$rndlast
	subs	$len,$len,#0x30
	veor	$tmp1,$in0,$rndlast
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	vorr	$ivec,$in2,$in2
	mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
# CTR mode with a 32-bit counter: emits ${prefix}_ctr32_encrypt_blocks.
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r10,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldr	r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr	$rounds,[$key,#240]

	ldr	$ctr, [$ivp, #12]
#ifdef __ARMEB__
	vld1.8	{$dat0},[$ivp]
#else
	vld1.32	{$dat0},[$ivp]
#endif
	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#4
	mov	$step,#16
	cmp	$len,#2
	add	$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]
	add	$key_,$key,#32
	mov	$cnt,$rounds
	cclr	$step,lo
#ifndef __ARMEB__
	rev	$ctr, $ctr
#endif
	add	$tctr1, $ctr, #1
	vorr	$ivec,$dat0,$dat0
	rev	$tctr1, $tctr1
	vmov.32	${ivec}[3],$tctr1
	add	$ctr, $ctr, #2
	vorr	$dat1,$ivec,$ivec
	b.ls	.Lctr32_tail
	rev	$tctr2, $ctr
	vmov.32	${ivec}[3],$tctr2
	sub	$len,$len,#3		// bias
	vorr	$dat2,$ivec,$ivec
	b	.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ctr32

	aese	$dat0,q8
	aesmc	$tmp0,$dat0
	aese	$dat1,q8
	aesmc	$tmp1,$dat1
	vld1.8	{$in0},[$inp],#16
	add	$tctr0,$ctr,#1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	rev	$tctr0,$tctr0
	aese	$tmp0,q9
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q9
	aesmc	$tmp1,$tmp1
	vld1.8	{$in2},[$inp],#16
	mov	$key_,$key
	aese	$dat2,q9
	aesmc	$tmp2,$dat2
	aese	$tmp0,q12
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q12
	aesmc	$tmp1,$tmp1
	veor	$in0,$in0,$rndlast
	add	$tctr1,$ctr,#2
	aese	$tmp2,q12
	aesmc	$tmp2,$tmp2
	veor	$in1,$in1,$rndlast
	add	$ctr,$ctr,#3
	aese	$tmp0,q13
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q13
	aesmc	$tmp1,$tmp1
	veor	$in2,$in2,$rndlast
	vmov.32	${ivec}[3], $tctr0
	aese	$tmp2,q13
	aesmc	$tmp2,$tmp2
	vorr	$dat0,$ivec,$ivec
	rev	$tctr1,$tctr1
	aese	$tmp0,q14
	aesmc	$tmp0,$tmp0
	vmov.32	${ivec}[3], $tctr1
	rev	$tctr2,$ctr
	aese	$tmp1,q14
	aesmc	$tmp1,$tmp1
	vorr	$dat1,$ivec,$ivec
	vmov.32	${ivec}[3], $tctr2
	aese	$tmp2,q14
	aesmc	$tmp2,$tmp2
	vorr	$dat2,$ivec,$ivec
	subs	$len,$len,#3
	aese	$tmp0,q15
	aese	$tmp1,q15
	aese	$tmp2,q15

	veor	$in0,$in0,$tmp0
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8	{$in0},[$out],#16
	veor	$in1,$in1,$tmp1
	mov	$cnt,$rounds
	vst1.8	{$in1},[$out],#16
	veor	$in2,$in2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$in2},[$out],#16
	b.hs	.Loop3x_ctr32

	adds	$len,$len,#3
	b.eq	.Lctr32_done
	cmp	$len,#1
	mov	$step,#16
	cclr	$step,eq

.Lctr32_tail:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.32	{q9},[$key_],#16
	b.gt	.Lctr32_tail

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.8	{$in0},[$inp],$step
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	vld1.8	{$in1},[$inp]
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	veor	$in0,$in0,$rndlast
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	veor	$in1,$in1,$rndlast
	aese	$dat0,q15
	aese	$dat1,q15

	cmp	$len,#1
	veor	$in0,$in0,$dat0
	veor	$in1,$in1,$dat1
	vst1.8	{$in0},[$out],#16
	b.eq	.Lctr32_done
	vst1.8	{$in1},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
# Post-processing.  $code holds assembly written in a mixed 32/64-bit
# dialect; walk it line by line, expand `...` snippets with eval, rewrite
# registers and mnemonics for the selected flavour, and print the result.
if ($flavour =~ /64/) {			######## 64-bit code
    # Base encodings of the AES instructions; register numbers get OR-ed in.
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    # Hand-encode one AES instruction as an .inst directive.  The only
    # call site (see the rewrite table below) is currently commented out.
    local *unaes = sub {
	my ($insn,$operands)=@_;

	return "" unless $operands =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/;
	return sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$insn}|$1|($2<<5),
			$insn,$operands;
    };

    # Mnemonic rewrites, tried in order on $_; only the first one that
    # actually substitutes something is applied to a given line.
    my @rewrite = (
	#sub { s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/ge },
	sub { s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/ },
	sub { s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/ },
	sub { s/vmov\.i8/movi/ },	# fix up legacy mnemonics
	sub { s/vext\.8/ext/ },
	sub { s/vrev32\.8/rev32/ },
	sub { s/vtst\.8/cmtst/ },
	sub { s/vshr/ushr/ },
	sub { s/^(\s+)v/$1/ },		# strip off v prefix
	sub { s/\bbx\s+lr\b/ret/ },
    );

    foreach (split(/\n/,$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/ge;	# old->new registers
	s/@\s/\/\//;					# old->new style commentary

	foreach my $attempt (@rewrite) {
	    last if $attempt->();
	}

	# fix up remaining legacy suffixes
	s/\.[ui]?8//;
	s/\.16b/\.8b/g	if m/\],#8/;
	s/\.16b/\.4s/g	if s/\.[ui]?32//;
	s/\.16b/\.2d/g	if s/\.[ui]?64//;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/;

	print $_,"\n";
    }
} else {				######## 32-bit code
    # Base encodings of the AES instructions; register numbers get OR-ed in.
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    # Hand-encode one AES instruction as four .byte values.
    local *unaes = sub {
	my ($insn,$operands)=@_;

	return "" unless $operands =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/;

	my $word = $opcode{$insn}|(($1&7)<<13)|(($1&8)<<19)
				 |(($2&7)<<1) |(($2&8)<<2);
	# since ARMv7 instructions are always encoded little-endian.
	# correct solution is to use .inst directive, but older
	# assemblers don't implement it:-(
	my @bytes = map { ($word>>(8*$_))&0xff } (0..3);
	return sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			@bytes,
			$insn,$operands;
    };

    # Split a q-register vtbl.8 into two vtbl.8 on the d-register halves.
    sub unvtbl {
	my $operands=shift;

	my ($dst,$tbl,$idx) =
	    $operands =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/
	    or return "";
	return sprintf "vtbl.8 d%d,{q%d},d%d\n\tvtbl.8 d%d,{q%d},d%d",
			2*$dst,$tbl,2*$idx, 2*$dst+1,$tbl,2*$idx+1;
    }

    # Rewrite a q-register lane source of vdup.32 as a d-register lane.
    sub unvdup32 {
	my $operands=shift;

	my ($dst,$src,$lane) =
	    $operands =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/
	    or return "";
	return sprintf "vdup.32 q%d,d%d[%d]",$dst,2*$src+($lane>>1),$lane&1;
    }

    # Rewrite a q-register lane destination of vmov.32 as a d-register lane.
    sub unvmov32 {
	my $operands=shift;

	my ($dst,$lane,$rest) =
	    $operands =~ m/q([0-9]+)\[([0-3])\],(.*)/
	    or return "";
	return sprintf "vmov.32 d%d[%d],%s",2*$dst+($lane>>1),$lane&1,$rest;
    }

    # Mnemonic rewrites, tried in order on $_; only the first one that
    # actually substitutes something is applied to a given line.
    my @rewrite = (
	sub { s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/ge },
	sub { s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/ },
	sub { s/vtbl\.8\s+(.*)/unvtbl($1)/ge },
	sub { s/vdup\.32\s+(.*)/unvdup32($1)/ge },
	sub { s/vmov\.32\s+(.*)/unvmov32($1)/ge },
	sub { s/^(\s+)b\./$1b/ },
	sub { s/^(\s+)mov\./$1mov/ },
	sub { s/^(\s+)ret/$1bx\tlr/ },
    );

    foreach (split(/\n/,$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b[wx]([0-9]+)\b/r$1/g;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/g;	# new->old registers
	s/\/\/\s?/@ /;				# new->old style commentary

	# fix up remaining new-style suffixes
	unless (s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/e) {
	    s/\],#[0-9]+/]!/;
	}

	foreach my $attempt (@rewrite) {
	    last if $attempt->();
	}

	print $_,"\n";
    }
}

close STDOUT or die "error closing STDOUT: $!";