#! /usr/bin/env perl
# Copyright 2014-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big-
# and little-endian cases, and it supports both 32- and 64-bit modes
# of operation. The latter is achieved by limiting the number of
# utilized registers to 16, which implies additional NEON load and
# integer instructions. This has no effect on the mighty Apple A7,
# where results are literally equal to the theoretical estimates
# based on AES instruction latencies and issue rates. On Cortex-A53,
# an in-order execution core, this costs up to 10-15%, which is
# partially compensated by a dedicated code path for the 128-bit
# CBC encrypt case. On Cortex-A57, parallelizable-mode performance
# seems to be limited by the sheer amount of NEON instructions...
#
# April 2019
#
# The key to performance of parallelizable modes is round instruction
# interleaving. But which factor to use? There is an optimal one for
# each combination of instruction latency and issue rate, beyond
# which increasing the interleave factor doesn't pay off, while on
# the cons side we get code size increase and resource waste on
# platforms for which the factor is too high. In other words, you
# want it to be just right. So far an interleave factor of 3x has
# served all platforms well, but for ThunderX2 the optimal factor
# was measured to be 5x...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.17/1.29(**)	1.36/1.46
# Cortex-A57(*)	1.95		0.82/0.85	0.89/0.93
# Cortex-A72	1.33		0.85/0.88	0.92/0.96
# Denver	1.96		0.65/0.86	0.76/0.80
# Mongoose	1.33		1.23/1.20	1.30/1.20
# Kryo		1.26		0.87/0.94	1.00/1.00
# ThunderX2	5.95		1.25		1.30
#
# (*)	original 3.64/1.34/1.32 results were for r0p0 revision
#	and are still the same even for the updated module;
# (**)	numbers after slash are for 32-bit code, which is 3x-
#	interleaved;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$prefix="aes_v8";

$_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
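# A hypothetical sketch (not emitted anywhere) of what the interleave
# factor discussed above means in practice: with 3x interleaving, three
# independent blocks advance through each round back to back, so the
# aese->aesmc result latency of one block is hidden behind the others:
#
#	aese	q0,q8		// block 0, round key in q8
#	aesmc	q0,q0
#	aese	q1,q8		// block 1
#	aesmc	q1,q1
#	aese	q2,q8		// block 2
#	aesmc	q2,q2
#
# The .Loop5x_* paths below extend the same pattern to five blocks.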
"DCB" : ".byte"); 76 77$code=<<___; 78#include "arm_arch.h" 79 80#if __ARM_MAX_ARCH__>=7 81___ 82$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/); 83$code.=<<___ if ($flavour !~ /64/); 84.arch armv7-a // don't confuse not-so-latest binutils with argv8 :-) 85.fpu neon 86#ifdef __thumb2__ 87.syntax unified 88.thumb 89# define INST(a,b,c,d) $_byte c,d|0xc,a,b 90#else 91.code 32 92# define INST(a,b,c,d) $_byte a,b,c,d 93#endif 94 95.text 96___ 97 98# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax, 99# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to 100# maintain both 32- and 64-bit codes within single module and 101# transliterate common code to either flavour with regex vodoo. 102# 103{{{ 104my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12"); 105my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)= 106 $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10)); 107 108 109$code.=<<___; 110.align 5 111.Lrcon: 112.long 0x01,0x01,0x01,0x01 113.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat 114.long 0x1b,0x1b,0x1b,0x1b 115 116.globl ${prefix}_set_encrypt_key 117.type ${prefix}_set_encrypt_key,%function 118.align 5 119${prefix}_set_encrypt_key: 120.Lenc_key: 121___ 122$code.=<<___ if ($flavour =~ /64/); 123 stp x29,x30,[sp,#-16]! 124 add x29,sp,#0 125___ 126$code.=<<___; 127 mov $ptr,#-1 128 cmp $inp,#0 129 b.eq .Lenc_key_abort 130 cmp $out,#0 131 b.eq .Lenc_key_abort 132 mov $ptr,#-2 133 cmp $bits,#128 134 b.lt .Lenc_key_abort 135 cmp $bits,#256 136 b.gt .Lenc_key_abort 137 tst $bits,#0x3f 138 b.ne .Lenc_key_abort 139 140 adr $ptr,.Lrcon 141 cmp $bits,#192 142 143 veor $zero,$zero,$zero 144 vld1.8 {$in0},[$inp],#16 145 mov $bits,#8 // reuse $bits 146 vld1.32 {$rcon,$mask},[$ptr],#32 147 148 b.lt .Loop128 149 b.eq .L192 150 b .L256 151 152.align 4 153.Loop128: 154 vtbl.8 $key,{$in0},$mask 155 vext.8 $tmp,$zero,$in0,#12 156 vst1.32 {$in0},[$out],#16 157 aese $key,$zero 158 subs $bits,$bits,#1 159 160 veor $in0,$in0,$tmp 161 vext.8 $tmp,$zero,$tmp,#12 162 veor $in0,$in0,$tmp 163 vext.8 $tmp,$zero,$tmp,#12 164 veor $key,$key,$rcon 165 veor $in0,$in0,$tmp 166 vshl.u8 $rcon,$rcon,#1 167 veor $in0,$in0,$key 168 b.ne .Loop128 169 170 vld1.32 {$rcon},[$ptr] 171 172 vtbl.8 $key,{$in0},$mask 173 vext.8 $tmp,$zero,$in0,#12 174 vst1.32 {$in0},[$out],#16 175 aese $key,$zero 176 177 veor $in0,$in0,$tmp 178 vext.8 $tmp,$zero,$tmp,#12 179 veor $in0,$in0,$tmp 180 vext.8 $tmp,$zero,$tmp,#12 181 veor $key,$key,$rcon 182 veor $in0,$in0,$tmp 183 vshl.u8 $rcon,$rcon,#1 184 veor $in0,$in0,$key 185 186 vtbl.8 $key,{$in0},$mask 187 vext.8 $tmp,$zero,$in0,#12 188 vst1.32 {$in0},[$out],#16 189 aese $key,$zero 190 191 veor $in0,$in0,$tmp 192 vext.8 $tmp,$zero,$tmp,#12 193 veor $in0,$in0,$tmp 194 vext.8 $tmp,$zero,$tmp,#12 195 veor $key,$key,$rcon 196 veor $in0,$in0,$tmp 197 veor $in0,$in0,$key 198 vst1.32 {$in0},[$out] 199 add $out,$out,#0x50 200 201 mov $rounds,#10 202 b .Ldone 203 204.align 4 205.L192: 206 vld1.8 {$in1},[$inp],#8 207 vmov.i8 $key,#8 // borrow $key 208 vst1.32 {$in0},[$out],#16 209 vsub.i8 $mask,$mask,$key // adjust the mask 210 211.Loop192: 212 vtbl.8 $key,{$in1},$mask 213 vext.8 $tmp,$zero,$in0,#12 214#ifdef __ARMEB__ 215 vst1.32 {$in1},[$out],#16 216 sub $out,$out,#8 217#else 218 vst1.32 {$in1},[$out],#8 219#endif 220 aese $key,$zero 221 subs $bits,$bits,#1 222 223 veor $in0,$in0,$tmp 224 vext.8 $tmp,$zero,$tmp,#12 225 veor $in0,$in0,$tmp 226 vext.8 $tmp,$zero,$tmp,#12 227 veor $in0,$in0,$tmp 228 229 vdup.32 $tmp,${in0}[3] 230 veor 
	veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
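# $e/$mc pick the mnemonic pair: aese/aesmc for encryption, aesd/aesimc
# for decryption; the round loop emitted below is otherwise identical
# for both directions.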
("e","mc") : ("d","imc"); 353my ($inp,$out,$key)=map("x$_",(0..2)); 354my $rounds="w3"; 355my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3)); 356 357$code.=<<___; 358.globl ${prefix}_${dir}crypt 359.type ${prefix}_${dir}crypt,%function 360.align 5 361${prefix}_${dir}crypt: 362 ldr $rounds,[$key,#240] 363 vld1.32 {$rndkey0},[$key],#16 364 vld1.8 {$inout},[$inp] 365 sub $rounds,$rounds,#2 366 vld1.32 {$rndkey1},[$key],#16 367 368.Loop_${dir}c: 369 aes$e $inout,$rndkey0 370 aes$mc $inout,$inout 371 vld1.32 {$rndkey0},[$key],#16 372 subs $rounds,$rounds,#2 373 aes$e $inout,$rndkey1 374 aes$mc $inout,$inout 375 vld1.32 {$rndkey1},[$key],#16 376 b.gt .Loop_${dir}c 377 378 aes$e $inout,$rndkey0 379 aes$mc $inout,$inout 380 vld1.32 {$rndkey0},[$key] 381 aes$e $inout,$rndkey1 382 veor $inout,$inout,$rndkey0 383 384 vst1.8 {$inout},[$out] 385 ret 386.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt 387___ 388} 389&gen_block("en"); 390&gen_block("de"); 391}}} 392 393# Performance in cycles per byte. 394# Processed with AES-ECB different key size. 395# It shows the value before and after optimization as below: 396# (before/after): 397# 398# AES-128-ECB AES-192-ECB AES-256-ECB 399# Cortex-A57 1.85/0.82 2.16/0.96 2.47/1.10 400# Cortex-A72 1.64/0.85 1.82/0.99 2.13/1.14 401 402# Optimization is implemented by loop unrolling and interleaving. 403# Commonly, we choose the unrolling factor as 5, if the input 404# data size smaller than 5 blocks, but not smaller than 3 blocks, 405# choose 3 as the unrolling factor. 406# If the input data size dsize >= 5*16 bytes, then take 5 blocks 407# as one iteration, every loop the left size lsize -= 5*16. 408# If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration, 409# every loop lsize -=3*16. 410# If lsize < 3*16 bytes, treat them as the tail, interleave the 411# two blocks AES instructions. 412# There is one special case, if the original input data size dsize 413# = 16 bytes, we will treat it seperately to improve the 414# performance: one independent code block without LR, FP load and 415# store, just looks like what the original ECB implementation does. 416 417{{{ 418my ($inp,$out,$len,$key)=map("x$_",(0..3)); 419my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8"); 420my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); 421 422my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); 423 424### q7 last round key 425### q10-q15 q7 Last 7 round keys 426### q8-q9 preloaded round keys except last 7 keys for big size 427### q5, q6, q8-q9 preloaded round keys except last 7 keys for only 16 byte 428 429{ 430my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); 431 432my ($dat3,$in3,$tmp3); # used only in 64-bit mode 433my ($dat4,$in4,$tmp4); 434if ($flavour =~ /64/) { 435 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); 436} 437 438$code.=<<___; 439.globl ${prefix}_ecb_encrypt 440.type ${prefix}_ecb_encrypt,%function 441.align 5 442${prefix}_ecb_encrypt: 443___ 444$code.=<<___ if ($flavour =~ /64/); 445 subs $len,$len,#16 446 // Original input data size bigger than 16, jump to big size processing. 447 b.ne .Lecb_big_size 448 vld1.8 {$dat0},[$inp] 449 cmp $enc,#0 // en- or decrypting? 450 ldr $rounds,[$key,#240] 451 vld1.32 {q5-q6},[$key],#32 // load key schedule... 452 453 b.eq .Lecb_small_dec 454 aese $dat0,q5 455 aesmc $dat0,$dat0 456 vld1.32 {q8-q9},[$key],#32 // load key schedule... 
	aese	$dat0,q6
	aesmc	$dat0,$dat0
	subs	$rounds,$rounds,#10	// if rounds==10, jump to aes-128-ecb processing
	b.eq	.Lecb_128_enc
.Lecb_round_loop:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	vld1.32	{q8},[$key],#16		// load key schedule...
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q9},[$key],#16		// load key schedule...
	subs	$rounds,$rounds,#2	// bias
	b.gt	.Lecb_round_loop
.Lecb_128_enc:
	vld1.32	{q10-q11},[$key],#32	// load key schedule...
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q12-q13},[$key],#32	// load key schedule...
	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat0,q11
	aesmc	$dat0,$dat0
	vld1.32	{q14-q15},[$key],#32	// load key schedule...
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	vld1.32	{$rndlast},[$key]
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	vst1.8	{$dat0},[$out]
	b	.Lecb_Final_abort
.Lecb_small_dec:
	aesd	$dat0,q5
	aesimc	$dat0,$dat0
	vld1.32	{q8-q9},[$key],#32	// load key schedule...
	aesd	$dat0,q6
	aesimc	$dat0,$dat0
	subs	$rounds,$rounds,#10	// bias
	b.eq	.Lecb_128_dec
.Lecb_dec_round_loop:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	vld1.32	{q8},[$key],#16		// load key schedule...
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q9},[$key],#16		// load key schedule...
	subs	$rounds,$rounds,#2	// bias
	b.gt	.Lecb_dec_round_loop
.Lecb_128_dec:
	vld1.32	{q10-q11},[$key],#32	// load key schedule...
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q12-q13},[$key],#32	// load key schedule...
	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	vld1.32	{q14-q15},[$key],#32	// load key schedule...
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	vld1.32	{$rndlast},[$key]
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	vst1.8	{$dat0},[$out]
	b	.Lecb_Final_abort
.Lecb_big_size:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
	subs	$len,$len,#16
___
$code.=<<___;
	mov	$step,#16
	b.lo	.Lecb_done
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
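	// The last 7 round keys live at $key+(rounds-6)*16; q10-q15 and
	// $rndlast are preloaded from there once, and the round loops only
	// stream the remaining early keys through q8/q9.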
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lecb_dec

	vld1.8	{$dat1},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat1,$dat1
	vorr	$dat2,$dat1,$dat1
	vorr	$dat1,$dat,$dat
	b.lo	.Lecb_enc_tail

	vorr	$dat1,$in1,$in1
	vld1.8	{$dat2},[$inp],#16
___
$code.=<<___	if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_ecb_enc

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32		// bias
	mov	$cnt,$rounds

.Loop5x_ecb_enc:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_ecb_enc

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	cmp	$len,#0x40		// because .Lecb_enc_tail4x
	sub	$len,$len,#0x50

	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not a typo
	mov	$key_,$key

	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat1,q10
	aesmc	$dat1,$dat1
	aese	$dat2,q10
	aesmc	$dat2,$dat2
	aese	$dat3,q10
	aesmc	$dat3,$dat3
	aese	$dat4,q10
	aesmc	$dat4,$dat4
	add	$inp,$inp,x6		// $inp is adjusted in such a way that
					// at exit from the loop $dat1-$dat4
					// are loaded with last "words"
	add	x6,$len,#0x60		// because .Lecb_enc_tail4x

	aese	$dat0,q11
	aesmc	$dat0,$dat0
	aese	$dat1,q11
	aesmc	$dat1,$dat1
	aese	$dat2,q11
	aesmc	$dat2,$dat2
	aese	$dat3,q11
	aesmc	$dat3,$dat3
	aese	$dat4,q11
	aesmc	$dat4,$dat4

	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	aese	$dat3,q12
	aesmc	$dat3,$dat3
	aese	$dat4,q12
	aesmc	$dat4,$dat4

	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat3,q13
	aesmc	$dat3,$dat3
	aese	$dat4,q13
	aesmc	$dat4,$dat4

	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat3,q14
	aesmc	$dat3,$dat3
	aese	$dat4,q14
	aesmc	$dat4,$dat4

	aese	$dat0,q15
	vld1.8	{$in0},[$inp],#16
	aese	$dat1,q15
	vld1.8	{$in1},[$inp],#16
	aese	$dat2,q15
	vld1.8	{$in2},[$inp],#16
	aese	$dat3,q15
	vld1.8	{$in3},[$inp],#16
	aese	$dat4,q15
	vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lecb_enc_tail4x
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	veor	$tmp0,$rndlast,$dat0
	vorr	$dat0,$in0,$in0
	veor	$tmp1,$rndlast,$dat1
	vorr	$dat1,$in1,$in1
	veor	$tmp2,$rndlast,$dat2
	vorr	$dat2,$in2,$in2
	veor	$tmp3,$rndlast,$dat3
	vorr	$dat3,$in3,$in3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_ecb_enc

	add	$len,$len,#0x50
	cbz	$len,.Lecb_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$dat2,$in4,$in4
	b.lo	.Lecb_enc_tail

	b	.Loop3x_ecb_enc

.align	4
.Lecb_enc_tail4x:
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	veor	$tmp3,$rndlast,$dat3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lecb_done
.align	4
___
$code.=<<___;
.Loop3x_ecb_enc:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ecb_enc

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	subs	$len,$len,#0x30
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	add	$inp,$inp,x6		// $inp is adjusted in such a way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	mov	$key_,$key
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aese	$dat0,q15
	aese	$dat1,q15
	aese	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$rndlast,$dat0
	veor	$tmp1,$rndlast,$dat1
	veor	$dat2,$dat2,$rndlast
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_ecb_enc

	cmn	$len,#0x30
	b.eq	.Lecb_done
	nop

.Lecb_enc_tail:
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lecb_enc_tail

	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	cmn	$len,#0x20
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat1,q15
	aese	$dat2,q15
	b.eq	.Lecb_enc_one
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lecb_done

.Lecb_enc_one:
	veor	$tmp1,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	b	.Lecb_done
___

$code.=<<___;
.align	5
.Lecb_dec:
	vld1.8	{$dat1},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat1,$dat1
	vorr	$dat2,$dat1,$dat1
	vorr	$dat1,$dat,$dat
	b.lo	.Lecb_dec_tail

	vorr	$dat1,$in1,$in1
	vld1.8	{$dat2},[$inp],#16
___
$code.=<<___	if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_ecb_dec

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32		// bias
	mov	$cnt,$rounds

.Loop5x_ecb_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_ecb_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	cmp	$len,#0x40		// because .Lecb_tail4x
	sub	$len,$len,#0x50

	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not a typo
	mov	$key_,$key

	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat1,q10
	aesimc	$dat1,$dat1
	aesd	$dat2,q10
	aesimc	$dat2,$dat2
	aesd	$dat3,q10
	aesimc	$dat3,$dat3
	aesd	$dat4,q10
	aesimc	$dat4,$dat4
	add	$inp,$inp,x6		// $inp is adjusted in such a way that
					// at exit from the loop $dat1-$dat4
					// are loaded with last "words"
	add	x6,$len,#0x60		// because .Lecb_tail4x

	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	aesd	$dat1,q11
	aesimc	$dat1,$dat1
	aesd	$dat2,q11
	aesimc	$dat2,$dat2
	aesd	$dat3,q11
	aesimc	$dat3,$dat3
	aesd	$dat4,q11
	aesimc	$dat4,$dat4

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat3,q12
	aesimc	$dat3,$dat3
	aesd	$dat4,q12
	aesimc	$dat4,$dat4

	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat3,q13
	aesimc	$dat3,$dat3
	aesd	$dat4,q13
	aesimc	$dat4,$dat4

	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat3,q14
	aesimc	$dat3,$dat3
	aesd	$dat4,q14
	aesimc	$dat4,$dat4

	aesd	$dat0,q15
	vld1.8	{$in0},[$inp],#16
	aesd	$dat1,q15
	vld1.8	{$in1},[$inp],#16
	aesd	$dat2,q15
	vld1.8	{$in2},[$inp],#16
	aesd	$dat3,q15
	vld1.8	{$in3},[$inp],#16
	aesd	$dat4,q15
	vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lecb_tail4x
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	veor	$tmp0,$rndlast,$dat0
	vorr	$dat0,$in0,$in0
	veor	$tmp1,$rndlast,$dat1
	vorr	$dat1,$in1,$in1
	veor	$tmp2,$rndlast,$dat2
	vorr	$dat2,$in2,$in2
	veor	$tmp3,$rndlast,$dat3
	vorr	$dat3,$in3,$in3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_ecb_dec

	add	$len,$len,#0x50
	cbz	$len,.Lecb_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$dat2,$in4,$in4
	b.lo	.Lecb_dec_tail

	b	.Loop3x_ecb_dec

.align	4
.Lecb_tail4x:
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	veor	$tmp3,$rndlast,$dat3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lecb_done
.align	4
___
$code.=<<___;
.Loop3x_ecb_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ecb_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	subs	$len,$len,#0x30
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	add	$inp,$inp,x6		// $inp is adjusted in such a way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$rndlast,$dat0
	veor	$tmp1,$rndlast,$dat1
	veor	$dat2,$dat2,$rndlast
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_ecb_dec

	cmn	$len,#0x30
	b.eq	.Lecb_done
	nop

.Lecb_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lecb_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lecb_dec_one
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lecb_done

.Lecb_dec_one:
	veor	$tmp1,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16

.Lecb_done:
___
}
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
___
$code.=<<___	if ($flavour =~ /64/);
.Lecb_Final_abort:
	ret
___
$code.=<<___;
.size	${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
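	// Same trick as in ECB: preload the last 7 round keys into
	// q10-q15/$rndlast and stream the early ones through q8/q9.
	// Note that CBC encryption is inherently serial (each block chains
	// into the next), so the encrypt paths below run one block at a
	// time, with a dedicated .Lcbc_enc128 path for 128-bit keys.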
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	vld1.32	{q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}

$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2
___
$code.=<<___	if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_cbc_dec

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32		// bias
	mov	$cnt,$rounds
	vorr	$in3,$dat3,$dat3
	vorr	$in4,$dat4,$dat4

.Loop5x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	cmp	$len,#0x40		// because .Lcbc_tail4x
	sub	$len,$len,#0x50

	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not a typo
	mov	$key_,$key

	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat1,q10
	aesimc	$dat1,$dat1
	aesd	$dat2,q10
	aesimc	$dat2,$dat2
	aesd	$dat3,q10
	aesimc	$dat3,$dat3
	aesd	$dat4,q10
	aesimc	$dat4,$dat4
	add	$inp,$inp,x6		// $inp is adjusted in such a way that
					// at exit from the loop $dat1-$dat4
					// are loaded with last "words"
	add	x6,$len,#0x60		// because .Lcbc_tail4x

	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	aesd	$dat1,q11
	aesimc	$dat1,$dat1
	aesd	$dat2,q11
	aesimc	$dat2,$dat2
	aesd	$dat3,q11
	aesimc	$dat3,$dat3
	aesd	$dat4,q11
	aesimc	$dat4,$dat4

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat3,q12
	aesimc	$dat3,$dat3
	aesd	$dat4,q12
	aesimc	$dat4,$dat4

	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat3,q13
	aesimc	$dat3,$dat3
	aesd	$dat4,q13
	aesimc	$dat4,$dat4

	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat3,q14
	aesimc	$dat3,$dat3
	aesd	$dat4,q14
	aesimc	$dat4,$dat4

	veor	$tmp0,$ivec,$rndlast
	aesd	$dat0,q15
	veor	$tmp1,$in0,$rndlast
	vld1.8	{$in0},[$inp],#16
	aesd	$dat1,q15
	veor	$tmp2,$in1,$rndlast
	vld1.8	{$in1},[$inp],#16
	aesd	$dat2,q15
	veor	$tmp3,$in2,$rndlast
	vld1.8	{$in2},[$inp],#16
	aesd	$dat3,q15
	veor	$tmp4,$in3,$rndlast
	vld1.8	{$in3},[$inp],#16
	aesd	$dat4,q15
	vorr	$ivec,$in4,$in4
	vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lcbc_tail4x
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	veor	$tmp0,$tmp0,$dat0
	vorr	$dat0,$in0,$in0
	veor	$tmp1,$tmp1,$dat1
	vorr	$dat1,$in1,$in1
	veor	$tmp2,$tmp2,$dat2
	vorr	$dat2,$in2,$in2
	veor	$tmp3,$tmp3,$dat3
	vorr	$dat3,$in3,$in3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_cbc_dec

	add	$len,$len,#0x50
	cbz	$len,.Lcbc_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$in0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$in1,$in3,$in3
	vorr	$dat2,$in4,$in4
	vorr	$in2,$in4,$in4
	b.lo	.Lcbc_dec_tail

	b	.Loop3x_cbc_dec

.align	4
.Lcbc_tail4x:
	veor	$tmp1,$tmp0,$dat1
	veor	$tmp2,$tmp2,$dat2
	veor	$tmp3,$tmp3,$dat3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lcbc_done
.align	4
___
$code.=<<___;
.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	veor	$tmp0,$ivec,$rndlast
	subs	$len,$len,#0x30
	veor	$tmp1,$in0,$rndlast
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	add	$inp,$inp,x6		// $inp is adjusted in such a way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	vorr	$ivec,$in2,$in2
	mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

# used only in 64-bit mode...
my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r10,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldr	r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr	$rounds,[$key,#240]

	ldr	$ctr, [$ivp, #12]
#ifdef __ARMEB__
	vld1.8	{$dat0},[$ivp]
#else
	vld1.32	{$dat0},[$ivp]
#endif
	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#4
	mov	$step,#16
	cmp	$len,#2
	add	$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]
	add	$key_,$key,#32
	mov	$cnt,$rounds
	cclr	$step,lo
#ifndef __ARMEB__
	rev	$ctr, $ctr
#endif
___
$code.=<<___	if ($flavour =~ /64/);
	vorr	$dat1,$dat0,$dat0
	add	$tctr1, $ctr, #1
	vorr	$dat2,$dat0,$dat0
	add	$ctr, $ctr, #2
	vorr	$ivec,$dat0,$dat0
	rev	$tctr1, $tctr1
	vmov.32	${dat1}[3],$tctr1
	b.ls	.Lctr32_tail
	rev	$tctr2, $ctr
	sub	$len,$len,#3		// bias
	vmov.32	${dat2}[3],$tctr2
___
$code.=<<___	if ($flavour !~ /64/);
	add	$tctr1, $ctr, #1
	vorr	$ivec,$dat0,$dat0
	rev	$tctr1, $tctr1
	vmov.32	${ivec}[3],$tctr1
	add	$ctr, $ctr, #2
	vorr	$dat1,$ivec,$ivec
	b.ls	.Lctr32_tail
	rev	$tctr2, $ctr
	vmov.32	${ivec}[3],$tctr2
	sub	$len,$len,#3		// bias
	vorr	$dat2,$ivec,$ivec
___
$code.=<<___	if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_ctr32

	add	w13,$ctr,#1
	add	w14,$ctr,#2
	vorr	$dat3,$dat0,$dat0
	rev	w13,w13
	vorr	$dat4,$dat0,$dat0
	rev	w14,w14
	vmov.32	${dat3}[3],w13
	sub	$len,$len,#2		// bias
	vmov.32	${dat4}[3],w14
	add	$ctr,$ctr,#2
	b	.Loop5x_ctr32

.align	4
.Loop5x_ctr32:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_ctr32

	mov	$key_,$key
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]

	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]

	aese	$dat0,q12
	aesmc	$dat0,$dat0
	add	$tctr0,$ctr,#1
	add	$tctr1,$ctr,#2
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	add	$tctr2,$ctr,#3
	add	w13,$ctr,#4
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	add	w14,$ctr,#5
	rev	$tctr0,$tctr0
	aese	$dat3,q12
	aesmc	$dat3,$dat3
	rev	$tctr1,$tctr1
	rev	$tctr2,$tctr2
	aese	$dat4,q12
	aesmc	$dat4,$dat4
	rev	w13,w13
	rev	w14,w14

	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat3,q13
	aesmc	$dat3,$dat3
	aese	$dat4,q13
	aesmc	$dat4,$dat4

	aese	$dat0,q14
	aesmc	$dat0,$dat0
	vld1.8	{$in0},[$inp],#16
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	vld1.8	{$in1},[$inp],#16
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aese	$dat3,q14
	aesmc	$dat3,$dat3
	vld1.8	{$in3},[$inp],#16
	aese	$dat4,q14
	aesmc	$dat4,$dat4
	vld1.8	{$in4},[$inp],#16

	aese	$dat0,q15
	veor	$in0,$in0,$rndlast
	aese	$dat1,q15
	veor	$in1,$in1,$rndlast
	aese	$dat2,q15
	veor	$in2,$in2,$rndlast
	aese	$dat3,q15
	veor	$in3,$in3,$rndlast
	aese	$dat4,q15
	veor	$in4,$in4,$rndlast

	veor	$in0,$in0,$dat0
	vorr	$dat0,$ivec,$ivec
	veor	$in1,$in1,$dat1
	vorr	$dat1,$ivec,$ivec
	veor	$in2,$in2,$dat2
	vorr	$dat2,$ivec,$ivec
	veor	$in3,$in3,$dat3
	vorr	$dat3,$ivec,$ivec
	veor	$in4,$in4,$dat4
	vorr	$dat4,$ivec,$ivec

	vst1.8	{$in0},[$out],#16
	vmov.32	${dat0}[3],$tctr0
	vst1.8	{$in1},[$out],#16
	vmov.32	${dat1}[3],$tctr1
	vst1.8	{$in2},[$out],#16
	vmov.32	${dat2}[3],$tctr2
	vst1.8	{$in3},[$out],#16
	vmov.32	${dat3}[3],w13
	vst1.8	{$in4},[$out],#16
	vmov.32	${dat4}[3],w14

	mov	$cnt,$rounds
	cbz	$len,.Lctr32_done

	add	$ctr,$ctr,#5
	subs	$len,$len,#5
	b.hs	.Loop5x_ctr32

	add	$len,$len,#5
	sub	$ctr,$ctr,#5

	cmp	$len,#2
	mov	$step,#16
	cclr	$step,lo
	b.ls	.Lctr32_tail

	sub	$len,$len,#3		// bias
	add	$ctr,$ctr,#3
___
$code.=<<___;
	b	.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ctr32

	aese	$dat0,q8
	aesmc	$tmp0,$dat0
	aese	$dat1,q8
	aesmc	$tmp1,$dat1
	vld1.8	{$in0},[$inp],#16
___
$code.=<<___	if ($flavour =~ /64/);
	vorr	$dat0,$ivec,$ivec
___
$code.=<<___	if ($flavour !~ /64/);
	add	$tctr0,$ctr,#1
___
$code.=<<___;
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
___
$code.=<<___	if ($flavour =~ /64/);
	vorr	$dat1,$ivec,$ivec
___
$code.=<<___	if ($flavour !~ /64/);
	rev	$tctr0,$tctr0
___
$code.=<<___;
	aese	$tmp0,q9
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q9
	aesmc	$tmp1,$tmp1
	vld1.8	{$in2},[$inp],#16
	mov	$key_,$key
	aese	$dat2,q9
	aesmc	$tmp2,$dat2
___
$code.=<<___	if ($flavour =~ /64/);
	vorr	$dat2,$ivec,$ivec
	add	$tctr0,$ctr,#1
___
$code.=<<___;
	aese	$tmp0,q12
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q12
	aesmc	$tmp1,$tmp1
	veor	$in0,$in0,$rndlast
	add	$tctr1,$ctr,#2
	aese	$tmp2,q12
	aesmc	$tmp2,$tmp2
	veor	$in1,$in1,$rndlast
	add	$ctr,$ctr,#3
	aese	$tmp0,q13
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q13
	aesmc	$tmp1,$tmp1
	veor	$in2,$in2,$rndlast
___
$code.=<<___	if ($flavour =~ /64/);
	rev	$tctr0,$tctr0
	aese	$tmp2,q13
	aesmc	$tmp2,$tmp2
	vmov.32	${dat0}[3], $tctr0
___
$code.=<<___	if ($flavour !~ /64/);
	vmov.32	${ivec}[3], $tctr0
	aese	$tmp2,q13
	aesmc	$tmp2,$tmp2
	vorr	$dat0,$ivec,$ivec
___
$code.=<<___;
	rev	$tctr1,$tctr1
	aese	$tmp0,q14
	aesmc	$tmp0,$tmp0
___
$code.=<<___	if ($flavour !~ /64/);
	vmov.32	${ivec}[3], $tctr1
	rev	$tctr2,$ctr
___
$code.=<<___;
	aese	$tmp1,q14
	aesmc	$tmp1,$tmp1
___
$code.=<<___	if ($flavour =~ /64/);
	vmov.32	${dat1}[3], $tctr1
	rev	$tctr2,$ctr
	aese	$tmp2,q14
	aesmc	$tmp2,$tmp2
	vmov.32	${dat2}[3], $tctr2
___
$code.=<<___	if ($flavour !~ /64/);
	vorr	$dat1,$ivec,$ivec
	vmov.32	${ivec}[3], $tctr2
	aese	$tmp2,q14
	aesmc	$tmp2,$tmp2
	vorr	$dat2,$ivec,$ivec
___
$code.=<<___;
	subs	$len,$len,#3
	aese	$tmp0,q15
	aese	$tmp1,q15
	aese	$tmp2,q15

	veor	$in0,$in0,$tmp0
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8	{$in0},[$out],#16
	veor	$in1,$in1,$tmp1
	mov	$cnt,$rounds
	vst1.8	{$in1},[$out],#16
	veor	$in2,$in2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$in2},[$out],#16
	b.hs	.Loop3x_ctr32

	adds	$len,$len,#3
	b.eq	.Lctr32_done
	cmp	$len,#1
	mov	$step,#16
	cclr	$step,eq

.Lctr32_tail:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.32	{q9},[$key_],#16
	b.gt	.Lctr32_tail

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.8	{$in0},[$inp],$step
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	vld1.8	{$in1},[$inp]
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	veor	$in0,$in0,$rndlast
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	veor	$in1,$in1,$rndlast
	aese	$dat0,q15
	aese	$dat1,q15

	cmp	$len,#1
	veor	$in0,$in0,$dat0
	veor	$in1,$in1,$dat1
	vst1.8	{$in0},[$out],#16
	b.eq	.Lctr32_done
	vst1.8	{$in1},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
# Performance in cycles per byte,
# measured with AES-XTS at different key sizes.
# The table shows the value before and after optimization
# (before/after):
#
#		AES-128-XTS	AES-256-XTS
# Cortex-A57	3.36/1.09	4.02/1.37
# Cortex-A72	3.03/1.02	3.28/1.33

# Optimization is implemented by loop unrolling and interleaving.
# Commonly we choose an unrolling factor of 5; if the input data
# size is smaller than 5 blocks but not smaller than 3 blocks,
# we choose 3 as the unrolling factor.
# If the input data size dsize >= 5*16 bytes, then take 5 blocks
# as one iteration; every loop the remaining size lsize -= 5*16.
# If lsize < 5*16 bytes, treat it as the tail. Note: a remaining
# 4*16 bytes is processed specially, integrated into the 5*16-byte
# loop to improve efficiency.
# There is one special case: if the original input data size dsize
# = 16 bytes, we treat it separately to improve performance: one
# independent code block without LR/FP load and store.
# Encryption processes the (length - tailcnt) bytes as described
# above, then encrypts the composite block as the second-to-last
# cipher block.
# Decryption processes the (length - tailcnt - 1) bytes as described
# above, then decrypts the second-to-last cipher block to get the
# last plain block (the tail), and decrypts the composite block as
# the second-to-last plaintext block.

{{{
my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b");
my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");

my ($tmpin)=("v26.16b");
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

# q7	last round key
# q10-q15, q7	last 7 round keys
# q8-q9	preloaded round keys except last 7 keys for big size
# q20, q21, q8-q9	preloaded round keys except last 7 keys for only 16 byte


my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}

$code.=<<___	if ($flavour =~ /64/);
.globl	${prefix}_xts_encrypt
.type	${prefix}_xts_encrypt,%function
.align	5
${prefix}_xts_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	cmp	$len,#16
	// Original input data size bigger than 16, jump to big size processing.
	b.ne	.Lxts_enc_big_size
	// Encrypt the iv with key2, as the first XEX iv.
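	// (In XEX/XTS the tweak for block i is E_{K2}(iv) multiplied by
	// x^i in GF(2^128); the multiplications by x are performed by the
	// extr/and/eor sequences that appear throughout the big-size path.)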
	ldr	$rounds,[$key2,#240]
	vld1.32	{$dat},[$key2],#16
	vld1.8	{$iv0},[$ivp]
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key2],#16

.Loop_enc_iv_enc:
	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2],#16
	subs	$rounds,$rounds,#2
	aese	$iv0,$dat1
	aesmc	$iv0,$iv0
	vld1.32	{$dat1},[$key2],#16
	b.gt	.Loop_enc_iv_enc

	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2]
	aese	$iv0,$dat1
	veor	$iv0,$iv0,$dat

	vld1.8	{$dat0},[$inp]
	veor	$dat0,$iv0,$dat0

	ldr	$rounds,[$key1,#240]
	vld1.32	{q20-q21},[$key1],#32		// load key schedule...

	aese	$dat0,q20
	aesmc	$dat0,$dat0
	vld1.32	{q8-q9},[$key1],#32		// load key schedule...
	aese	$dat0,q21
	aesmc	$dat0,$dat0
	subs	$rounds,$rounds,#10		// if rounds==10, jump to aes-128-xts processing
	b.eq	.Lxts_128_enc
.Lxts_enc_round_loop:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	vld1.32	{q8},[$key1],#16		// load key schedule...
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q9},[$key1],#16		// load key schedule...
	subs	$rounds,$rounds,#2		// bias
	b.gt	.Lxts_enc_round_loop
.Lxts_128_enc:
	vld1.32	{q10-q11},[$key1],#32		// load key schedule...
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q12-q13},[$key1],#32		// load key schedule...
	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat0,q11
	aesmc	$dat0,$dat0
	vld1.32	{q14-q15},[$key1],#32		// load key schedule...
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	vld1.32	{$rndlast},[$key1]
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	veor	$dat0,$dat0,$iv0
	vst1.8	{$dat0},[$out]
	b	.Lxts_enc_final_abort

.align	4
.Lxts_enc_big_size:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	$constnumx,$tmpinp,[sp,#-64]!
	stp	$tailcnt,$midnumx,[sp,#48]
	stp	$ivd10,$ivd20,[sp,#32]
	stp	$ivd30,$ivd40,[sp,#16]

	// tailcnt stores the tail value of length%16.
	and	$tailcnt,$len,#0xf
	and	$len,$len,#-16
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lxts_abort
	csel	$step,xzr,$step,eq

	// Firstly, encrypt the iv with key2, as the first iv of XEX.
	ldr	$rounds,[$key2,#240]
	vld1.32	{$dat},[$key2],#16
	vld1.8	{$iv0},[$ivp]
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key2],#16

.Loop_iv_enc:
	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2],#16
	subs	$rounds,$rounds,#2
	aese	$iv0,$dat1
	aesmc	$iv0,$iv0
	vld1.32	{$dat1},[$key2],#16
	b.gt	.Loop_iv_enc

	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2]
	aese	$iv0,$dat1
	veor	$iv0,$iv0,$dat

	// The iv for second block
	// $ivl - iv(low), $ivh - iv(high)
	// the five ivs are stored into $iv0,$iv1,$iv2,$iv3,$iv4
	fmov	$ivl,$ivd00
	fmov	$ivh,$ivd01
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh

	ldr	$rounds0,[$key1,#240]		// next starting point
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key1]			// load key schedule...
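	// Note: $rounds0 (w5) aliases $ivp (x5), which is no longer needed
	// once the first tweak has been computed; the add below reuses it
	// as the (rounds-6) count to locate the last 7 round keys.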
	sub	$rounds0,$rounds0,#6
	add	$key_,$key1,$ivp,lsl#4		// pointer to last 7 round keys
	sub	$rounds0,$rounds0,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key1,#32
	mov	$rounds,$rounds0

	// Encryption
.Lxts_enc:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32			// bias
	add	$rounds,$rounds0,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in3,$dat,$dat
	vorr	$in2,$dat2,$dat2
	vorr	$in4,$dat2,$dat2
	b.lo	.Lxts_inner_enc_tail
	veor	$dat,$dat,$iv0			// before encryption, xor with iv
	veor	$dat2,$dat2,$iv1

	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	veor	$in2,$dat2,$iv2			// the third block
	veor	$dat2,$dat2,$iv2
	cmp	$len,#32
	b.lo	.Lxts_outer_enc_tail

	// The iv for fourth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd30,$ivl
	fmov	$ivd31,$ivh

	vld1.8	{$dat3},[$inp],#16
	// The iv for fifth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd40,$ivl
	fmov	$ivd41,$ivh

	vld1.8	{$dat4},[$inp],#16
	veor	$dat3,$dat3,$iv3		// the fourth block
	veor	$dat4,$dat4,$iv4
	sub	$len,$len,#32			// bias
	mov	$rounds,$rounds0
	b	.Loop5x_xts_enc

.align	4
.Loop5x_xts_enc:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_xts_enc

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	subs	$len,$len,#0x50			// because .Lxts_enc_tail4x

	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	csel	$xoffset,xzr,$len,gt		// borrow x6, w6, "gt" is not typo
	mov	$key_,$key1

	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat1,q10
	aesmc	$dat1,$dat1
	aese	$dat2,q10
	aesmc	$dat2,$dat2
	aese	$dat3,q10
	aesmc	$dat3,$dat3
	aese	$dat4,q10
	aesmc	$dat4,$dat4
	add	$inp,$inp,$xoffset		// x0 is adjusted in such a way that
						// at exit from the loop v1.16b-v26.16b
						// are loaded with the last "words"
	add	$xoffset,$len,#0x60		// because .Lxts_enc_tail4x

	aese	$dat0,q11
	aesmc	$dat0,$dat0
	aese	$dat1,q11
	aesmc	$dat1,$dat1
	aese	$dat2,q11
	aesmc	$dat2,$dat2
	aese	$dat3,q11
	aesmc	$dat3,$dat3
	aese	$dat4,q11
	aesmc	$dat4,$dat4

	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	aese	$dat3,q12
	aesmc	$dat3,$dat3
	aese	$dat4,q12
	aesmc	$dat4,$dat4

	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat3,q13
	aesmc	$dat3,$dat3
	aese	$dat4,q13
	aesmc	$dat4,$dat4

	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat3,q14
	aesmc	$dat3,$dat3
	aese	$dat4,q14
	aesmc	$dat4,$dat4

	veor	$tmp0,$rndlast,$iv0
	aese	$dat0,q15
	// The iv for first block of next iteration
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	veor	$tmp1,$rndlast,$iv1
	vld1.8	{$in0},[$inp],#16
	aese	$dat1,q15
	// The iv for second block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh
	veor	$tmp2,$rndlast,$iv2
	vld1.8	{$in1},[$inp],#16
	aese	$dat2,q15
	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh
	veor	$tmp3,$rndlast,$iv3
	vld1.8	{$in2},[$inp],#16
	aese	$dat3,q15
	// The iv for fourth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd30,$ivl
	fmov	$ivd31,$ivh
	veor	$tmp4,$rndlast,$iv4
	vld1.8	{$in3},[$inp],#16
	aese	$dat4,q15

	// The iv for fifth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd40,$ivl
	fmov	$ivd41,$ivh

	vld1.8	{$in4},[$inp],#16
	cbz	$xoffset,.Lxts_enc_tail4x
	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
	veor	$tmp0,$tmp0,$dat0
	veor	$dat0,$in0,$iv0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat1,$in1,$iv1
	veor	$tmp2,$tmp2,$dat2
	veor	$dat2,$in2,$iv2
	veor	$tmp3,$tmp3,$dat3
	veor	$dat3,$in3,$iv3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp0},[$out],#16
	veor	$dat4,$in4,$iv4
	vst1.8	{$tmp1},[$out],#16
	mov	$rounds,$rounds0
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_xts_enc

	// If 4 blocks are left, borrow the five-block processing.
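	// Reading aid for the borrow (no new logic): $inp was already
	// stepped back 16 bytes above, so $in0 re-reads the block just
	// processed while the other four lanes hold the real remaining
	// blocks. The tweaks are shifted up one lane and lane 0 runs as
	// a throwaway; its result is dropped at .Lxts_enc_tail4x, which
	// stores lanes 1-4 only.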
	cmn	$len,#0x10
	b.ne	.Loop5x_enc_after
	vorr	$iv4,$iv3,$iv3
	vorr	$iv3,$iv2,$iv2
	vorr	$iv2,$iv1,$iv1
	vorr	$iv1,$iv0,$iv0
	fmov	$ivl,$ivd40
	fmov	$ivh,$ivd41
	veor	$dat0,$iv0,$in0
	veor	$dat1,$iv1,$in1
	veor	$dat2,$in2,$iv2
	veor	$dat3,$in3,$iv3
	veor	$dat4,$in4,$iv4
	b.eq	.Loop5x_xts_enc

.Loop5x_enc_after:
	add	$len,$len,#0x50
	cbz	$len,.Lxts_enc_done

	add	$rounds,$rounds0,#2
	subs	$len,$len,#0x30
	b.lo	.Lxts_inner_enc_tail

	veor	$dat0,$iv0,$in2
	veor	$dat1,$iv1,$in3
	veor	$dat2,$in4,$iv2
	b	.Lxts_outer_enc_tail

.align	4
.Lxts_enc_tail4x:
	add	$inp,$inp,#16
	veor	$tmp1,$dat1,$tmp1
	vst1.8	{$tmp1},[$out],#16
	veor	$tmp2,$dat2,$tmp2
	vst1.8	{$tmp2},[$out],#16
	veor	$tmp3,$dat3,$tmp3
	veor	$tmp4,$dat4,$tmp4
	vst1.8	{$tmp3-$tmp4},[$out],#32

	b	.Lxts_enc_done
.align	4
.Lxts_outer_enc_tail:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lxts_outer_enc_tail

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	veor	$tmp0,$iv0,$rndlast
	subs	$len,$len,#0x30
	// The iv for first block
	fmov	$ivl,$ivd20
	fmov	$ivh,$ivd21
	//mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	veor	$tmp1,$iv1,$rndlast
	csel	$xoffset,$len,$xoffset,lo	// x6, w6, is zero at this point
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	veor	$tmp2,$iv2,$rndlast

	add	$xoffset,$xoffset,#0x20
	add	$inp,$inp,$xoffset
	mov	$key_,$key1

	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat0,q15
	aese	$dat1,q15
	aese	$dat2,q15
	vld1.8	{$in2},[$inp],#16
	add	$rounds,$rounds0,#2
	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$dat2},[$out],#16
	cmn	$len,#0x30
	b.eq	.Lxts_enc_done
.Lxts_encxor_one:
	vorr	$in3,$in1,$in1
	vorr	$in4,$in2,$in2
	nop

.Lxts_inner_enc_tail:
	cmn	$len,#0x10
	veor	$dat1,$in3,$iv0
	veor	$dat2,$in4,$iv1
	b.eq	.Lxts_enc_tail_loop
	veor	$dat2,$in4,$iv0
.Lxts_enc_tail_loop:
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lxts_enc_tail_loop
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	cmn	$len,#0x20
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	veor	$tmp1,$iv0,$rndlast
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	veor	$tmp2,$iv1,$rndlast
	aese	$dat1,q15
	aese	$dat2,q15
	b.eq	.Lxts_enc_one
	veor	$tmp1,$tmp1,$dat1
	vst1.8	{$tmp1},[$out],#16
	veor	$tmp2,$tmp2,$dat2
	vorr	$iv0,$iv1,$iv1
	vst1.8	{$tmp2},[$out],#16
	fmov	$ivl,$ivd10
	fmov	$ivh,$ivd11
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	b	.Lxts_enc_done

.Lxts_enc_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$iv0,$iv0,$iv0
	vst1.8	{$tmp1},[$out],#16
	fmov	$ivl,$ivd00
	fmov	$ivh,$ivd01
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	b	.Lxts_enc_done
.align	5
.Lxts_enc_done:
	// Process the tail block with cipher stealing.
	tst	$tailcnt,#0xf
	b.eq	.Lxts_abort

	mov	$tmpinp,$inp
	mov	$tmpoutp,$out
	sub	$out,$out,#16
.composite_enc_loop:
	subs	$tailcnt,$tailcnt,#1
	ldrb	$l2outp,[$out,$tailcnt]
	ldrb	$loutp,[$tmpinp,$tailcnt]
	strb	$l2outp,[$tmpoutp,$tailcnt]
	strb	$loutp,[$out,$tailcnt]
	b.gt	.composite_enc_loop
.Lxts_enc_load_done:
	vld1.8	{$tmpin},[$out]
	veor	$tmpin,$tmpin,$iv0

	// Encrypt the composite block to get the second-to-last encrypted text block.
	ldr	$rounds,[$key1,#240]		// load key schedule...
	vld1.32	{$dat},[$key1],#16
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key1],#16		// load key schedule...
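	// Cipher-stealing recap (comment only): the byte loop above moved
	// the leading tailcnt bytes of the last full ciphertext block up
	// to the final partial-output position and spliced the plaintext
	// tail bytes into their place, so [$out] now holds the composite
	// block. It is encrypted below with tweak $iv0 like a normal block.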
.Loop_final_enc:
	aese	$tmpin,$dat0
	aesmc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key1],#16
	subs	$rounds,$rounds,#2
	aese	$tmpin,$dat1
	aesmc	$tmpin,$tmpin
	vld1.32	{$dat1},[$key1],#16
	b.gt	.Loop_final_enc

	aese	$tmpin,$dat0
	aesmc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key1]
	aese	$tmpin,$dat1
	veor	$tmpin,$tmpin,$dat0
	veor	$tmpin,$tmpin,$iv0
	vst1.8	{$tmpin},[$out]

.Lxts_abort:
	ldp	$tailcnt,$midnumx,[sp,#48]
	ldp	$ivd10,$ivd20,[sp,#32]
	ldp	$ivd30,$ivd40,[sp,#16]
	ldp	$constnumx,$tmpinp,[sp],#64
.Lxts_enc_final_abort:
	ret
.size	${prefix}_xts_encrypt,.-${prefix}_xts_encrypt
___

}}}
{{{
my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b");
my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

# q7              last round key
# q10-q15, q7     last 7 round keys
# q8-q9           preloaded round keys except the last 7 keys for big size
# q20, q21, q8-q9 preloaded round keys except the last 7 keys for the 16-byte-only case

{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}

$code.=<<___	if ($flavour =~ /64/);
.globl	${prefix}_xts_decrypt
.type	${prefix}_xts_decrypt,%function
.align	5
${prefix}_xts_decrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	cmp	$len,#16
	// Original input data size bigger than 16, jump to big size processing.
	b.ne	.Lxts_dec_big_size
	// Encrypt the iv with key2, as the first XEX iv.
	ldr	$rounds,[$key2,#240]
	vld1.32	{$dat},[$key2],#16
	vld1.8	{$iv0},[$ivp]
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key2],#16

.Loop_dec_small_iv_enc:
	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2],#16
	subs	$rounds,$rounds,#2
	aese	$iv0,$dat1
	aesmc	$iv0,$iv0
	vld1.32	{$dat1},[$key2],#16
	b.gt	.Loop_dec_small_iv_enc

	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2]
	aese	$iv0,$dat1
	veor	$iv0,$iv0,$dat

	vld1.8	{$dat0},[$inp]
	veor	$dat0,$iv0,$dat0

	ldr	$rounds,[$key1,#240]
	vld1.32	{q20-q21},[$key1],#32		// load key schedule...

	aesd	$dat0,q20
	aesimc	$dat0,$dat0
	vld1.32	{q8-q9},[$key1],#32		// load key schedule...
	aesd	$dat0,q21
	aesimc	$dat0,$dat0
	subs	$rounds,$rounds,#10		// if rounds==10, jump to aes-128-xts processing
	b.eq	.Lxts_128_dec
.Lxts_dec_round_loop:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	vld1.32	{q8},[$key1],#16		// load key schedule...
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q9},[$key1],#16		// load key schedule...
	subs	$rounds,$rounds,#2		// bias
	b.gt	.Lxts_dec_round_loop
.Lxts_128_dec:
	vld1.32	{q10-q11},[$key1],#32		// load key schedule...
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q12-q13},[$key1],#32		// load key schedule...
	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	vld1.32	{q14-q15},[$key1],#32		// load key schedule...
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	vld1.32	{$rndlast},[$key1]
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	veor	$dat0,$iv0,$dat0
	vst1.8	{$dat0},[$out]
	b	.Lxts_dec_final_abort
.Lxts_dec_big_size:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	$constnumx,$tmpinp,[sp,#-64]!
	stp	$tailcnt,$midnumx,[sp,#48]
	stp	$ivd10,$ivd20,[sp,#32]
	stp	$ivd30,$ivd40,[sp,#16]

	and	$tailcnt,$len,#0xf
	and	$len,$len,#-16
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lxts_dec_abort

	// Encrypt the iv with key2, as the first XEX iv.
	ldr	$rounds,[$key2,#240]
	vld1.32	{$dat},[$key2],#16
	vld1.8	{$iv0},[$ivp]
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key2],#16

.Loop_dec_iv_enc:
	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2],#16
	subs	$rounds,$rounds,#2
	aese	$iv0,$dat1
	aesmc	$iv0,$iv0
	vld1.32	{$dat1},[$key2],#16
	b.gt	.Loop_dec_iv_enc

	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2]
	aese	$iv0,$dat1
	veor	$iv0,$iv0,$dat

	// The iv for second block
	// $ivl - iv(low), $ivh - iv(high);
	// the five ivs are stored in $iv0,$iv1,$iv2,$iv3,$iv4.
	fmov	$ivl,$ivd00
	fmov	$ivh,$ivd01
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh

	ldr	$rounds0,[$key1,#240]		// load rounds number

	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh

	vld1.32	{q8-q9},[$key1]			// load key schedule...
	sub	$rounds0,$rounds0,#6
	add	$key_,$key1,$ivp,lsl#4		// pointer to last 7 round keys
	sub	$rounds0,$rounds0,#2
	vld1.32	{q10-q11},[$key_],#32		// load key schedule...
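	// Register plan (matches the table at the top of this block):
	// q10-q15 plus $rndlast keep the last 7 round keys resident for
	// the whole function, while q8-q9 are recycled as a sliding
	// window over the earlier round keys.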
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	// The iv for fourth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd30,$ivl
	fmov	$ivd31,$ivh

	add	$key_,$key1,#32
	mov	$rounds,$rounds0
	b	.Lxts_dec

	// Decryption
.align	5
.Lxts_dec:
	tst	$tailcnt,#0xf
	b.eq	.Lxts_dec_begin
	subs	$len,$len,#16
	csel	$step,xzr,$step,eq
	vld1.8	{$dat},[$inp],#16
	b.lo	.Lxts_done
	sub	$inp,$inp,#16
.Lxts_dec_begin:
	vld1.8	{$dat},[$inp],$step
	subs	$len,$len,#32			// bias
	add	$rounds,$rounds0,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in3,$dat,$dat
	vld1.8	{$dat2},[$inp],#16
	vorr	$in2,$dat2,$dat2
	vorr	$in4,$dat2,$dat2
	b.lo	.Lxts_inner_dec_tail
	veor	$dat,$dat,$iv0			// before decryption, xor with iv
	veor	$dat2,$dat2,$iv1

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	veor	$in2,$dat2,$iv2			// third block xor with third iv
	veor	$dat2,$dat2,$iv2
	cmp	$len,#32
	b.lo	.Lxts_outer_dec_tail

	vld1.8	{$dat3},[$inp],#16

	// The iv for fifth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd40,$ivl
	fmov	$ivd41,$ivh

	vld1.8	{$dat4},[$inp],#16
	veor	$dat3,$dat3,$iv3		// the fourth block
	veor	$dat4,$dat4,$iv4
	sub	$len,$len,#32			// bias
	mov	$rounds,$rounds0
	b	.Loop5x_xts_dec

.align	4
.Loop5x_xts_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16		// load key schedule...
	subs	$rounds,$rounds,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16		// load key schedule...
	b.gt	.Loop5x_xts_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	subs	$len,$len,#0x50			// because .Lxts_dec_tail4x

	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	csel	$xoffset,xzr,$len,gt		// borrow x6, w6, "gt" is not typo
	mov	$key_,$key1

	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat1,q10
	aesimc	$dat1,$dat1
	aesd	$dat2,q10
	aesimc	$dat2,$dat2
	aesd	$dat3,q10
	aesimc	$dat3,$dat3
	aesd	$dat4,q10
	aesimc	$dat4,$dat4
	add	$inp,$inp,$xoffset		// x0 is adjusted in such a way that
						// at exit from the loop v1.16b-v26.16b
						// are loaded with the last "words"
	add	$xoffset,$len,#0x60		// because .Lxts_dec_tail4x

	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	aesd	$dat1,q11
	aesimc	$dat1,$dat1
	aesd	$dat2,q11
	aesimc	$dat2,$dat2
	aesd	$dat3,q11
	aesimc	$dat3,$dat3
	aesd	$dat4,q11
	aesimc	$dat4,$dat4

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat3,q12
	aesimc	$dat3,$dat3
	aesd	$dat4,q12
	aesimc	$dat4,$dat4

	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat3,q13
	aesimc	$dat3,$dat3
	aesd	$dat4,q13
	aesimc	$dat4,$dat4

	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat3,q14
	aesimc	$dat3,$dat3
	aesd	$dat4,q14
	aesimc	$dat4,$dat4

	veor	$tmp0,$rndlast,$iv0
	aesd	$dat0,q15
	// The iv for first block of next iteration
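	// Tweak-update sketch: each extr/and/eor group below multiplies
	// the 128-bit tweak by x in GF(2^128), i.e.
	//   new = (old << 1) ^ (0x87 if the top bit of old was set)
	// $midnumx caches the rotated high half so that "asr #31" turns
	// the carried-out bit into the 0x87/0 mask in $tmpmw.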
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	veor	$tmp1,$rndlast,$iv1
	vld1.8	{$in0},[$inp],#16
	aesd	$dat1,q15
	// The iv for second block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh
	veor	$tmp2,$rndlast,$iv2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat2,q15
	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh
	veor	$tmp3,$rndlast,$iv3
	vld1.8	{$in2},[$inp],#16
	aesd	$dat3,q15
	// The iv for fourth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd30,$ivl
	fmov	$ivd31,$ivh
	veor	$tmp4,$rndlast,$iv4
	vld1.8	{$in3},[$inp],#16
	aesd	$dat4,q15

	// The iv for fifth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd40,$ivl
	fmov	$ivd41,$ivh

	vld1.8	{$in4},[$inp],#16
	cbz	$xoffset,.Lxts_dec_tail4x
	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
	veor	$tmp0,$tmp0,$dat0
	veor	$dat0,$in0,$iv0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat1,$in1,$iv1
	veor	$tmp2,$tmp2,$dat2
	veor	$dat2,$in2,$iv2
	veor	$tmp3,$tmp3,$dat3
	veor	$dat3,$in3,$iv3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp0},[$out],#16
	veor	$dat4,$in4,$iv4
	vst1.8	{$tmp1},[$out],#16
	mov	$rounds,$rounds0
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_xts_dec

	cmn	$len,#0x10
	b.ne	.Loop5x_dec_after
	// If x2 ($len) equals -0x10, there are 4 blocks left.
	// After special processing, the five-block path is utilized again.
	// It will use the following IVs: $iv0,$iv0,$iv1,$iv2,$iv3.
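	// As on the encrypt side, lane 0 is a throwaway here: its result
	// is dropped at .Lxts_dec_tail4x, which stores lanes 1-4 only.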
	vorr	$iv4,$iv3,$iv3
	vorr	$iv3,$iv2,$iv2
	vorr	$iv2,$iv1,$iv1
	vorr	$iv1,$iv0,$iv0
	fmov	$ivl,$ivd40
	fmov	$ivh,$ivd41
	veor	$dat0,$iv0,$in0
	veor	$dat1,$iv1,$in1
	veor	$dat2,$in2,$iv2
	veor	$dat3,$in3,$iv3
	veor	$dat4,$in4,$iv4
	b.eq	.Loop5x_xts_dec

.Loop5x_dec_after:
	add	$len,$len,#0x50
	cbz	$len,.Lxts_done

	add	$rounds,$rounds0,#2
	subs	$len,$len,#0x30
	b.lo	.Lxts_inner_dec_tail

	veor	$dat0,$iv0,$in2
	veor	$dat1,$iv1,$in3
	veor	$dat2,$in4,$iv2
	b	.Lxts_outer_dec_tail

.align	4
.Lxts_dec_tail4x:
	add	$inp,$inp,#16
	tst	$tailcnt,#0xf
	veor	$tmp1,$dat1,$tmp0
	vst1.8	{$tmp1},[$out],#16
	veor	$tmp2,$dat2,$tmp2
	vst1.8	{$tmp2},[$out],#16
	veor	$tmp3,$dat3,$tmp3
	veor	$tmp4,$dat4,$tmp4
	vst1.8	{$tmp3-$tmp4},[$out],#32

	b.eq	.Lxts_dec_abort
	vld1.8	{$dat0},[$inp],#16
	b	.Lxts_done
.align	4
.Lxts_outer_dec_tail:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lxts_outer_dec_tail

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	veor	$tmp0,$iv0,$rndlast
	subs	$len,$len,#0x30
	// The iv for first block
	fmov	$ivl,$ivd20
	fmov	$ivh,$ivd21
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	veor	$tmp1,$iv1,$rndlast
	csel	$xoffset,$len,$xoffset,lo	// x6, w6, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	veor	$tmp2,$iv2,$rndlast
	// The iv for second block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh

	add	$xoffset,$xoffset,#0x20
	add	$inp,$inp,$xoffset		// $inp is adjusted to point at the last data

	mov	$key_,$key1

	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
	add	$rounds,$rounds0,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$dat2},[$out],#16

	cmn	$len,#0x30
	add	$len,$len,#0x30
	b.eq	.Lxts_done
	sub	$len,$len,#0x30
	vorr	$in3,$in1,$in1
	vorr	$in4,$in2,$in2
	nop

.Lxts_inner_dec_tail:
	// $len == -0x10 means two blocks are left.
	cmn	$len,#0x10
	veor	$dat1,$in3,$iv0
	veor	$dat2,$in4,$iv1
	b.eq	.Lxts_dec_tail_loop
	veor	$dat2,$in4,$iv0
.Lxts_dec_tail_loop:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lxts_dec_tail_loop

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	veor	$tmp1,$iv0,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	veor	$tmp2,$iv1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lxts_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	vorr	$iv0,$iv2,$iv2
	vorr	$iv1,$iv3,$iv3
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	add	$len,$len,#16
	b	.Lxts_done

.Lxts_dec_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$iv0,$iv1,$iv1
	vorr	$iv1,$iv2,$iv2
	vst1.8	{$tmp1},[$out],#16
	add	$len,$len,#32

.Lxts_done:
	tst	$tailcnt,#0xf
	b.eq	.Lxts_dec_abort
	// Process the last two blocks with cipher stealing.
	mov	x7,x3
	cbnz	x2,.Lxts_dec_1st_done
	vld1.8	{$dat0},[$inp],#16

	// Decrypt the second-to-last block to get the last plain text block.
.Lxts_dec_1st_done:
	eor	$tmpin,$dat0,$iv1
	ldr	$rounds,[$key1,#240]
	vld1.32	{$dat0},[$key1],#16
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key1],#16
.Loop_final_2nd_dec:
	aesd	$tmpin,$dat0
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key1],#16		// load key schedule...
	subs	$rounds,$rounds,#2
	aesd	$tmpin,$dat1
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat1},[$key1],#16		// load key schedule...
	b.gt	.Loop_final_2nd_dec

	aesd	$tmpin,$dat0
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key1]
	aesd	$tmpin,$dat1
	veor	$tmpin,$tmpin,$dat0
	veor	$tmpin,$tmpin,$iv1
	vst1.8	{$tmpin},[$out]

	mov	$tmpinp,$inp
	add	$tmpoutp,$out,#16

	// Splice the tailcnt bytes of the 16-byte-unaligned tail into the
	// second-to-last plain block, reassembling the last full encrypted block.
.composite_dec_loop:
	subs	$tailcnt,$tailcnt,#1
	ldrb	$l2outp,[$out,$tailcnt]
	ldrb	$loutp,[$tmpinp,$tailcnt]
	strb	$l2outp,[$tmpoutp,$tailcnt]
	strb	$loutp,[$out,$tailcnt]
	b.gt	.composite_dec_loop
.Lxts_dec_load_done:
	vld1.8	{$tmpin},[$out]
	veor	$tmpin,$tmpin,$iv0

	// Decrypt the composite block to get the second-to-last plain text block.
	ldr	$rounds,[$key_,#240]
	vld1.32	{$dat},[$key_],#16
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key_],#16
.Loop_final_dec:
	aesd	$tmpin,$dat0
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key_],#16		// load key schedule...
	subs	$rounds,$rounds,#2
	aesd	$tmpin,$dat1
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat1},[$key_],#16		// load key schedule...
	b.gt	.Loop_final_dec

	aesd	$tmpin,$dat0
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key_]
	aesd	$tmpin,$dat1
	veor	$tmpin,$tmpin,$dat0
	veor	$tmpin,$tmpin,$iv0
	vst1.8	{$tmpin},[$out]

.Lxts_dec_abort:
	ldp	$tailcnt,$midnumx,[sp,#48]
	ldp	$ivd10,$ivd20,[sp,#32]
	ldp	$ivd30,$ivd40,[sp,#16]
	ldp	$constnumx,$tmpinp,[sp],#64

.Lxts_dec_final_abort:
	ret
.size	${prefix}_xts_decrypt,.-${prefix}_xts_decrypt
___
}
}}}
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=> 0x4e285800,	"aese"	=> 0x4e284800,
	"aesimc"=> 0x4e287800,	"aesmc"	=> 0x4e286800	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=> 0xf3b00340,	"aese"	=> 0xf3b00300,
	"aesimc"=> 0xf3b003c0,	"aesmc"	=> 0xf3b00380	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # Since ARMv7 instructions are always encoded little-endian,
	    # the correct solution would be to use the .inst directive,
	    # but older assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };

    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
	    print "	it	$2\n";
	}

	print $_,"\n";
    }
}

close STDOUT or die "error closing STDOUT: $!";