• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*
2 * This file is part of the openHiTLS project.
3 *
4 * openHiTLS is licensed under the Mulan PSL v2.
5 * You can use this software according to the terms and conditions of the Mulan PSL v2.
6 * You may obtain a copy of Mulan PSL v2 at:
7 *
8 *     http://license.coscl.org.cn/MulanPSL2
9 *
10 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
11 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
12 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
13 * See the Mulan PSL v2 for more details.
14 */
15
16#include "hitls_build.h"
17#if defined(HITLS_CRYPTO_AES) && defined(HITLS_CRYPTO_XTS)
18
19#include "crypt_aes_macro_armv8.s"
20#include "crypt_arm.h"
21
22.file    "crypt_aes_xts_armv8.S"
23.text
24.arch    armv8-a+crypto
25
// ---------------------------------------------------------------------------
// Register allocation shared by CRYPT_AES_XTS_Encrypt / CRYPT_AES_XTS_Decrypt.
// Several aliases deliberately name the same physical register (for example
// WP/WTMP2/XTMP2 are all views of x11); they are never live at the same time.
// ---------------------------------------------------------------------------

// Incoming arguments (AAPCS64 order: ctx, in, out, len, tweak).
KEY      .req    x0
IN       .req    x1
OUT      .req    x2
LEN      .req    x3
TWEAK    .req    x4

// Pointer / length bookkeeping.
KTMP     .req    x5      // walking pointer into the round-key schedule
LTMP     .req    x6      // bytes of whole blocks still to process
TAILNUM  .req    x8      // len % 16, counted down by the tail byte-swap loop
POS      .req    x16     // offset added to IN by the 5-block pipeline
TMPOUT   .req    x17     // address of the previous output block (OUT - 16)

// Round counters. ROUNDS and XROUNDS are the 32/64-bit views of x7.
ROUNDS   .req    w7
XROUNDS  .req    x7
TROUNDS  .req    w15

// Integer scratch. WTMP0 holds the constant 0x87 for NextTweak.
WTMP0    .req    w9
WTMP1    .req    w10
WTMP2    .req    w11
WTMP3    .req    w12     // not referenced in this file
XTMP1    .req    x10
XTMP2    .req    x11

// Byte temporaries of the tail loop (share w11/w12 with WTMP2/WTMP3).
WP       .req    w11
WC       .req    w12

// Running tweak kept in general-purpose registers (low / high 64 bits).
TWX0     .req    x13
TWX1     .req    x14
TWW1     .req    w14     // not referenced in this file

// AES state for up to five blocks in flight.
BLK0     .req    v0
BLK1     .req    v1
BLK2     .req    v2
BLK3     .req    v3
BLK4     .req    v4

// Input blocks prefetched for the next 5-block iteration.
IN0      .req    v5
IN1      .req    v6
IN2      .req    v7
IN3      .req    v30
IN4      .req    v31

// Per-block tweaks, as vectors and as their low 64-bit d-register views.
TWK0     .req    v8
TWK1     .req    v9
TWK2     .req    v10
TWK3     .req    v11
TWK4     .req    v12

TWKD00   .req    d8
TWKD10   .req    d9
TWKD20   .req    d10
TWKD30   .req    d11
TWKD40   .req    d12

// High 64-bit lane of each tweak vector (an element reference cannot be a
// .req alias, hence the preprocessor defines).
#define TWKD01   v8.d[1]
#define TWKD11   v9.d[1]
#define TWKD21   v10.d[1]
#define TWKD31   v11.d[1]
#define TWKD41   v12.d[1]

// Round keys: RDK0/RDK1 are reloaded while walking the schedule,
// RDK2..RDK8 are loaded once with the last seven round keys.
RDK0     .req    v16
RDK1     .req    v17
RDK2     .req    v18
RDK3     .req    v19
RDK4     .req    v20
RDK5     .req    v21
RDK6     .req    v22
RDK7     .req    v23
RDK8     .req    v24

// Vector scratch used to assemble outputs in the 5-block pipeline.
TMP0     .req    v25
TMP1     .req    v26
TMP2     .req    v27
TMP3     .req    v28
TMP4     .req    v29
102
103
// Copy a 128-bit value held in two general-purpose registers into a vector
// register: SRC0 -> DES0 (low half), then SRC1 -> DES1 (high lane).
// The low half is written first, matching the ordering note in NextTweak.
// The two instructions are joined with the GNU AArch64 statement separator.
#define MOV_REG_TO_VEC(SRC0, SRC1, DES0, DES1)        \
    fmov DES0,SRC0 ; fmov DES1,SRC1
107
// NextTweak twkl, twkh, twkd0, twkd1
//   Advance the XTS tweak by one block. The 128-bit value twkh:twkl is
//   shifted left by one bit; if the bit shifted out of twkh was set, the
//   constant held in WTMP0 is XORed into the low half.
//   twkl, twkh   : general-purpose registers holding the tweak (updated)
//   twkd0, twkd1 : low d-register and high lane of the vector that receives
//                  a copy of the new tweak
//   Requires     : WTMP0 == 0x87 (set once at function entry)
//   Clobbers     : XTMP1 (x10), XTMP2 (x11); condition flags are untouched
.macro NextTweak twkl, twkh, twkd0, twkd1
    asr     XTMP2, \twkh, #63               // all ones if the top tweak bit is set
    and     WTMP1, WTMP0, WTMP2             // 0x87 when that bit is set, else 0
    extr    \twkh, \twkh, \twkl, #63        // high half of (tweak << 1)
    eor     \twkl, XTMP1, \twkl, lsl #1     // low half of (tweak << 1), reduced
    fmov    \twkd0, \twkl                   // write the low half first: a write
    fmov    \twkd1, \twkh                   // through the d view clears the high lane
.endm
116
// AesCrypt1x en, mc, d0, rk
//   One non-final AES round on a single state register.
//   en : e or d     (selects aese / aesd)
//   mc : mc or imc  (selects aesmc / aesimc)
//   d0 : state register name without arrangement suffix (updated in place)
//   rk : round-key register name without arrangement suffix
.macro AesCrypt1x en, mc, d0, rk
    aes\en  \d0\().16b, \rk\().16b
    aes\mc  \d0\().16b, \d0\().16b
.endm

// Encryption round: aese followed by aesmc.
.macro AesEncrypt1x d0, rk
    AesCrypt1x e, mc, \d0, \rk
.endm

// Decryption round: aesd followed by aesimc.
.macro AesDecrypt1x d0, rk
    AesCrypt1x d, imc, \d0, \rk
.endm
129
/**
 * int32_t CRYPT_AES_XTS_Encrypt(const CRYPT_AES_Key *ctx, const uint8_t *in, uint8_t *out, uint32_t len, const uint8_t *tweak);
 *
 * AES-XTS encryption of len bytes from in to out.
 *
 *   x0 (KEY)   : key context; round keys start at offset 0, the round count is read from offset 240
 *   x1 (IN)    : input buffer
 *   x2 (OUT)   : output buffer
 *   w3 (LEN)   : length in bytes. Only the low 32 bits are used, because the upper half of x3 is
 *                unspecified for a uint32_t argument under AAPCS64.
 *   x4 (TWEAK) : 16-byte tweak for the first block; the tweak following the last processed block
 *                is stored back to this buffer before returning.
 *
 * Whole blocks are processed five at a time while at least 80 bytes remain, then in groups of
 * three, two or one. A trailing partial block (len % 16 != 0) is handled with ciphertext stealing,
 * which reads and rewrites the 16 bytes before the current output position.
 * NOTE(review): this assumes len >= 16 whenever len % 16 != 0 - confirm the caller enforces it.
 *
 * Returns 0 in x0. d8-d15 are saved and restored as required for callee-saved SIMD registers.
 */
.globl CRYPT_AES_XTS_Encrypt
.type CRYPT_AES_XTS_Encrypt, %function
.align 4
CRYPT_AES_XTS_Encrypt:
AARCH64_PACIASP
    stp x29, x30, [sp,#-80]!
    add x29, sp, #0
    stp d8, d9, [sp,#16]
    stp d10, d11, [sp,#32]
    stp d12, d13, [sp,#48]
    stp d14, d15, [sp,#64]

    ld1 {TWK0.16b}, [TWEAK]
    and TAILNUM, LEN, #0xF          // tail bytes, len % 16
    and LTMP, LEN, #0xfffffff0      // whole-block bytes; also drops the unspecified upper 32 bits of x3
    mov WTMP0,0x87                  // constant consumed by NextTweak
    ldr ROUNDS,[KEY,#240]
    fmov TWX0,TWKD00                // keep a copy of the tweak in x13:x14
    fmov TWX1,TWKD01

    sub ROUNDS,ROUNDS,#6            // preload the last 7 round keys into RDK2..RDK8
    add KTMP,KEY,XROUNDS,lsl#4
    ld1 {RDK2.4s,RDK3.4s},[KTMP],#32
    ld1 {RDK4.4s,RDK5.4s},[KTMP],#32
    ld1 {RDK6.4s,RDK7.4s},[KTMP],#32
    ld1 {RDK8.4s},[KTMP]

    // Dispatch on the number of whole-block bytes left.
.Lxts_aesenc_start:
    cmp LTMP, #80
    b.ge .Lxts_enc_proc_5_blks
    cmp LTMP, #48
    b.ge .Lxts_enc_proc_3_blks
    cmp LTMP, #32
    b.eq .Lxts_enc_proc_2_blks
    cmp LTMP, #16
    b.eq .Lxts_enc_proc_1blk

    // No whole block left: finish, or steal ciphertext for the partial tail.
.Lxtx_tail_blk:
    fmov TWX0,TWKD00  // reset already computed tweak
    fmov TWX1,TWKD01
    cbz TAILNUM,.Lxts_aesenc_finish
    // prepare encrypt tail block
    sub TMPOUT,OUT,#16              // previous ciphertext block
.Lxtx_tail_blk_loop:
    // For each tail byte i: out[i] = previous ciphertext[i], and the previous
    // block position receives in[i]. Both loads precede both stores.
    subs TAILNUM,TAILNUM,1
    ldrb WC,[TMPOUT,TAILNUM]
    ldrb WP,[IN,TAILNUM]
    strb WC,[OUT,TAILNUM]
    strb WP,[TMPOUT,TAILNUM]
    b.gt .Lxtx_tail_blk_loop
    ld1 {BLK0.16b}, [TMPOUT]        // merged block to encrypt with the current tweak
    mov LTMP,#16
    mov OUT,TMPOUT                  // result overwrites the previous block position
    b .Lxts_enc_proc_1blk_loaded

.Lxts_enc_proc_1blk:
    ld1 {BLK0.16b},[IN],#16
.Lxts_enc_proc_1blk_loaded:
    eor BLK0.16b,BLK0.16b,TWK0.16b
    mov KTMP, KEY
    ld1 {RDK0.4s,RDK1.4s},[KTMP],#32
    sub TROUNDS,ROUNDS,#2
.Lxts_rounds_1blks:
    AesEncrypt1x BLK0,RDK0
    ld1 {RDK0.4s},[KTMP],#16
    subs TROUNDS,TROUNDS,#2

    AesEncrypt1x BLK0,RDK1
    ld1 {RDK1.4s},[KTMP],#16
    b.gt .Lxts_rounds_1blks

    AesEncrypt1x BLK0,RDK0
    AesEncrypt1x BLK0,RDK1

    // last 7 round keys
    AesEncrypt1x BLK0,RDK2
    AesEncrypt1x BLK0,RDK3
    AesEncrypt1x BLK0,RDK4
    AesEncrypt1x BLK0,RDK5
    AesEncrypt1x BLK0,RDK6

    aese BLK0.16b,RDK7.16b  // final round
    eor BLK0.16b,BLK0.16b,RDK8.16b
    eor BLK0.16b,BLK0.16b,TWK0.16b

    st1 {BLK0.16b}, [OUT], #16

    NextTweak TWX0,TWX1,TWKD00,TWKD01

    subs LTMP,LTMP,#16
    b.hs .Lxts_aesenc_start         // LTMP is at least 16 on every path reaching here

.Lxts_enc_proc_2_blks:
    ld1 {BLK0.16b, BLK1.16b}, [IN], #32
    mov KTMP, KEY
    NextTweak TWX0,TWX1,TWKD10,TWKD11
    ld1 {RDK0.4s,RDK1.4s},[KTMP],#32
    sub TROUNDS,ROUNDS,#2
    eor BLK0.16b, BLK0.16b, TWK0.16b
    eor BLK1.16b, BLK1.16b, TWK1.16b
.Lxts_rounds_2blks:
    AesEncrypt1x BLK0,RDK0
    AesEncrypt1x BLK1,RDK0
    ld1 {RDK0.4s},[KTMP],#16
    subs TROUNDS,TROUNDS,#2

    AesEncrypt1x BLK0,RDK1
    AesEncrypt1x BLK1,RDK1
    ld1 {RDK1.4s},[KTMP],#16
    b.gt .Lxts_rounds_2blks

    AesEncrypt1x BLK0,RDK0
    AesEncrypt1x BLK1,RDK0

    AesEncrypt1x BLK0,RDK1
    AesEncrypt1x BLK1,RDK1

    // last 7 round keys
    AesEncrypt1x BLK0,RDK2
    AesEncrypt1x BLK1,RDK2

    AesEncrypt1x BLK0,RDK3
    AesEncrypt1x BLK1,RDK3

    AesEncrypt1x BLK0,RDK4
    AesEncrypt1x BLK1,RDK4

    AesEncrypt1x BLK0,RDK5
    AesEncrypt1x BLK1,RDK5

    AesEncrypt1x BLK0,RDK6
    AesEncrypt1x BLK1,RDK6

    // Fold the last round key into the tweaks so one eor finishes each block.
    // The tweak vectors are rebuilt from TWX0/TWX1 before they are used again.
    eor TWK0.16b,TWK0.16b,RDK8.16b
    eor TWK1.16b,TWK1.16b,RDK8.16b

    aese BLK0.16b,RDK7.16b  // final round
    aese BLK1.16b,RDK7.16b

    eor BLK0.16b,BLK0.16b,TWK0.16b
    eor BLK1.16b,BLK1.16b,TWK1.16b

    st1 {BLK0.16b, BLK1.16b}, [OUT], #32
    NextTweak TWX0,TWX1,TWKD00,TWKD01
    subs LTMP,LTMP,#32
    b.hs .Lxts_aesenc_start

.Lxts_enc_proc_3_blks:
    ld1 {BLK0.16b}, [IN], #16   // first block
    NextTweak TWX0,TWX1,TWKD10,TWKD11
    eor BLK0.16b,BLK0.16b,TWK0.16b

    ld1 {BLK1.16b}, [IN], #16   // second block
    NextTweak TWX0,TWX1,TWKD20,TWKD21
    eor BLK1.16b,BLK1.16b,TWK1.16b

    ld1 {BLK2.16b}, [IN], #16   // third block
    eor BLK2.16b,BLK2.16b,TWK2.16b

    mov KTMP, KEY
    ld1 {RDK0.4s,RDK1.4s},[KTMP],#32
    sub TROUNDS,ROUNDS,#2

.Lxts_rounds_3blks:
    AesEncrypt1x BLK0,RDK0
    AesEncrypt1x BLK1,RDK0
    AesEncrypt1x BLK2,RDK0
    ld1 {RDK0.4s},[KTMP],#16
    subs TROUNDS,TROUNDS,#2

    AesEncrypt1x BLK0,RDK1
    AesEncrypt1x BLK1,RDK1
    AesEncrypt1x BLK2,RDK1
    ld1 {RDK1.4s},[KTMP],#16
    b.gt .Lxts_rounds_3blks

    AesEncrypt1x BLK0,RDK0
    AesEncrypt1x BLK1,RDK0
    AesEncrypt1x BLK2,RDK0

    AesEncrypt1x BLK0,RDK1
    AesEncrypt1x BLK1,RDK1
    AesEncrypt1x BLK2,RDK1

    // last 7 round keys
    AesEncrypt1x BLK0,RDK2
    AesEncrypt1x BLK1,RDK2
    AesEncrypt1x BLK2,RDK2

    AesEncrypt1x BLK0,RDK3
    AesEncrypt1x BLK1,RDK3
    AesEncrypt1x BLK2,RDK3

    AesEncrypt1x BLK0,RDK4
    AesEncrypt1x BLK1,RDK4
    AesEncrypt1x BLK2,RDK4

    AesEncrypt1x BLK0,RDK5
    AesEncrypt1x BLK1,RDK5
    AesEncrypt1x BLK2,RDK5

    AesEncrypt1x BLK0,RDK6
    AesEncrypt1x BLK1,RDK6
    AesEncrypt1x BLK2,RDK6

    eor TWK0.16b,TWK0.16b,RDK8.16b
    eor TWK1.16b,TWK1.16b,RDK8.16b
    eor TWK2.16b,TWK2.16b,RDK8.16b

    aese BLK0.16b,RDK7.16b
    aese BLK1.16b,RDK7.16b
    aese BLK2.16b,RDK7.16b

    eor BLK0.16b,BLK0.16b,TWK0.16b
    eor BLK1.16b,BLK1.16b,TWK1.16b
    eor BLK2.16b,BLK2.16b,TWK2.16b

    st1 {BLK0.16b, BLK1.16b, BLK2.16b}, [OUT], #48

    NextTweak TWX0,TWX1,TWKD00,TWKD01

    subs LTMP,LTMP,#48
    b.hs .Lxts_aesenc_start

.align 4
.Lxts_enc_proc_5_blks:
    ld1 {BLK0.16b}, [IN], #16   // first block
    NextTweak TWX0,TWX1,TWKD10,TWKD11
    eor BLK0.16b,BLK0.16b,TWK0.16b

    ld1 {BLK1.16b}, [IN], #16   // second block
    NextTweak TWX0,TWX1,TWKD20,TWKD21
    eor BLK1.16b,BLK1.16b,TWK1.16b
    sub LTMP,LTMP,#32

    ld1 {BLK2.16b}, [IN], #16   // third block
    NextTweak TWX0,TWX1,TWKD30,TWKD31
    eor BLK2.16b,BLK2.16b,TWK2.16b

    ld1 {BLK3.16b}, [IN], #16   // fourth block
    NextTweak TWX0,TWX1,TWKD40,TWKD41
    eor BLK3.16b,BLK3.16b,TWK3.16b
    sub LTMP,LTMP,#32

    ld1 {BLK4.16b}, [IN], #16   // fifth block
    eor BLK4.16b, BLK4.16b, TWK4.16b
    sub LTMP,LTMP,#16           // LTMP = whole-block bytes left after this group

    mov KTMP, KEY
    ld1 {RDK0.4s,RDK1.4s},[KTMP],#32
    sub TROUNDS,ROUNDS,#2
.align 4
.Lxts_rounds_5blks:
    AesEncrypt1x BLK0,RDK0
    AesEncrypt1x BLK1,RDK0
    AesEncrypt1x BLK2,RDK0
    AesEncrypt1x BLK3,RDK0
    AesEncrypt1x BLK4,RDK0
    ld1 {RDK0.4s},[KTMP],#16
    subs TROUNDS,TROUNDS,#2

    AesEncrypt1x BLK0,RDK1
    AesEncrypt1x BLK1,RDK1
    AesEncrypt1x BLK2,RDK1
    AesEncrypt1x BLK3,RDK1
    AesEncrypt1x BLK4,RDK1
    ld1 {RDK1.4s},[KTMP],#16
    b.gt .Lxts_rounds_5blks

    AesEncrypt1x BLK0,RDK0
    AesEncrypt1x BLK1,RDK0
    AesEncrypt1x BLK2,RDK0
    AesEncrypt1x BLK3,RDK0
    AesEncrypt1x BLK4,RDK0
    // Account for the next group. These flags stay live until the b.hs at the
    // end of the iteration; nothing in between may modify the condition flags.
    subs LTMP,LTMP,#80

    AesEncrypt1x BLK0,RDK1
    AesEncrypt1x BLK1,RDK1
    AesEncrypt1x BLK2,RDK1
    AesEncrypt1x BLK3,RDK1
    AesEncrypt1x BLK4,RDK1

    // last 7 round keys
    AesEncrypt1x BLK0,RDK2
    AesEncrypt1x BLK1,RDK2
    AesEncrypt1x BLK2,RDK2
    AesEncrypt1x BLK3,RDK2
    AesEncrypt1x BLK4,RDK2
    csel POS,xzr,LTMP,gt  // POS = 0 while more than 80 bytes follow, else LTMP (<= 0)

    AesEncrypt1x BLK0,RDK3
    AesEncrypt1x BLK1,RDK3
    AesEncrypt1x BLK2,RDK3
    AesEncrypt1x BLK3,RDK3
    AesEncrypt1x BLK4,RDK3
    add IN,IN,POS         // step back so the 80-byte prefetch ends at the last whole block

    AesEncrypt1x BLK0,RDK4
    AesEncrypt1x BLK1,RDK4
    AesEncrypt1x BLK2,RDK4
    AesEncrypt1x BLK3,RDK4
    AesEncrypt1x BLK4,RDK4

    AesEncrypt1x BLK0,RDK5
    AesEncrypt1x BLK1,RDK5
    AesEncrypt1x BLK2,RDK5
    AesEncrypt1x BLK3,RDK5
    AesEncrypt1x BLK4,RDK5

    AesEncrypt1x BLK0,RDK6
    AesEncrypt1x BLK1,RDK6
    AesEncrypt1x BLK2,RDK6
    AesEncrypt1x BLK3,RDK6
    AesEncrypt1x BLK4,RDK6

    eor TMP0.16b,TWK0.16b,RDK8.16b
    aese BLK0.16b,RDK7.16b  // final round
    NextTweak TWX0,TWX1,TWKD00,TWKD01  // perform operations of next 5blks in advance

    eor TMP1.16b,TWK1.16b,RDK8.16b
    ld1 {IN0.16b}, [IN], #16
    aese BLK1.16b,RDK7.16b
    NextTweak TWX0,TWX1,TWKD10,TWKD11

    eor TMP2.16b,TWK2.16b,RDK8.16b
    ld1 {IN1.16b}, [IN], #16
    aese BLK2.16b,RDK7.16b
    NextTweak TWX0,TWX1,TWKD20,TWKD21

    eor TMP3.16b,TWK3.16b,RDK8.16b
    ld1 {IN2.16b}, [IN], #16
    aese BLK3.16b,RDK7.16b
    NextTweak TWX0,TWX1,TWKD30,TWKD31

    eor TMP4.16b,TWK4.16b,RDK8.16b
    ld1 {IN3.16b}, [IN], #16
    aese BLK4.16b,RDK7.16b
    NextTweak TWX0,TWX1,TWKD40,TWKD41

    ld1 {IN4.16b}, [IN], #16
    mov KTMP, KEY
    ld1 {RDK0.4s,RDK1.4s},[KTMP],#32
    eor TMP0.16b,TMP0.16b,BLK0.16b
    eor BLK0.16b,IN0.16b,TWK0.16b  // blk0 = in0 ^ twk0
    eor TMP1.16b,TMP1.16b,BLK1.16b
    eor BLK1.16b,IN1.16b,TWK1.16b
    st1 {TMP0.16b}, [OUT], #16
    eor TMP2.16b,TMP2.16b,BLK2.16b
    eor BLK2.16b,IN2.16b,TWK2.16b
    eor TMP3.16b,TMP3.16b,BLK3.16b
    eor BLK3.16b,IN3.16b,TWK3.16b
    st1 {TMP1.16b}, [OUT], #16
    eor TMP4.16b,TMP4.16b,BLK4.16b
    eor BLK4.16b,IN4.16b,TWK4.16b
    st1 {TMP2.16b}, [OUT], #16
    sub TROUNDS,ROUNDS,#2
    st1 {TMP3.16b,TMP4.16b}, [OUT], #32

    b.hs .Lxts_rounds_5blks  // flags from the subs above: another full group is loaded
    add LTMP,LTMP,#80      // add 5 blocks length back if LTMP < 0
    cbz LTMP,.Lxtx_tail_blk
    // 16..64 bytes remain. IN0..IN4 hold the last five whole input blocks, so
    // the remaining blocks are the final LTMP/16 of them. Each BLKn currently
    // equals INn ^ TWKn; the fix-ups below swap in the right input block.
    cmp LTMP, #16
    b.eq .Lxts_pre_last_1blks
    cmp LTMP,#32
    b.eq .Lxts_pre_last_2blks
    cmp LTMP,#48
    b.eq .Lxts_pre_last_3blks
    cmp LTMP,#64
    b.eq .Lxts_pre_last_4blks
.Lxts_pre_last_1blks:
    eor IN0.16b,IN0.16b,IN4.16b   // in0 = in0 ^ in4
    eor BLK0.16b,BLK0.16b,IN0.16b   // blk0 = in0 ^ twk0 ^ in0 ^ in4 = in4 ^ twk0
    fmov TWX0,TWKD00  // reset already computed tweak
    fmov TWX1,TWKD01
    b .Lxts_rounds_1blks
.Lxts_pre_last_2blks:
    eor BLK0.16b,BLK0.16b,IN0.16b
    eor BLK1.16b,BLK1.16b,IN1.16b
    eor BLK0.16b,BLK0.16b,IN3.16b  // in3 -> blk0
    eor BLK1.16b,BLK1.16b,IN4.16b  // in4 -> blk1
    fmov TWX0,TWKD10  // reset already computed tweak
    fmov TWX1,TWKD11
    b .Lxts_rounds_2blks
.Lxts_pre_last_3blks:
    eor BLK0.16b,BLK0.16b,IN0.16b
    eor BLK1.16b,BLK1.16b,IN1.16b
    eor BLK2.16b,BLK2.16b,IN2.16b
    eor BLK0.16b,BLK0.16b,IN2.16b  // in2 -> blk0
    eor BLK1.16b,BLK1.16b,IN3.16b  // in3 -> blk1
    eor BLK2.16b,BLK2.16b,IN4.16b  // in4 -> blk2
    fmov TWX0,TWKD20  // reset already computed tweak
    fmov TWX1,TWKD21
    b .Lxts_rounds_3blks
.Lxts_pre_last_4blks:
    eor BLK0.16b,BLK0.16b,IN0.16b
    eor BLK1.16b,BLK1.16b,IN1.16b
    eor BLK2.16b,BLK2.16b,IN2.16b
    eor BLK3.16b,BLK3.16b,IN3.16b
    sub IN,IN,#16 // four blocks were loaded but only three are processed here, so step back one block
    eor BLK0.16b,BLK0.16b,IN1.16b  // in1 -> blk0
    eor BLK1.16b,BLK1.16b,IN2.16b  // in2 -> blk1
    eor BLK2.16b,BLK2.16b,IN3.16b  // in3 -> blk2
    eor BLK3.16b,BLK3.16b,IN4.16b  // in4 -> blk3
    fmov TWX0,TWKD20  // reset already computed tweak
    fmov TWX1,TWKD21
    b .Lxts_rounds_3blks

.Lxts_aesenc_finish:
    // Write the tweak for the next block back to the caller buffer.
    MOV_REG_TO_VEC(TWX0,TWX1,TWKD00,TWKD01)
    st1 {TWK0.16b}, [TWEAK]

    mov x0, #0  // return 0

    ldp d14, d15, [sp,#64]
    ldp d12, d13, [sp, #48]
    ldp d10, d11, [sp, #32]
    ldp d8, d9, [sp, #16]
    ldp x29, x30, [sp], #80

AARCH64_AUTIASP
    ret
.size CRYPT_AES_XTS_Encrypt, .-CRYPT_AES_XTS_Encrypt
557
558
/**
 * int32_t CRYPT_AES_XTS_Decrypt(const CRYPT_AES_Key *ctx, const uint8_t *in, uint8_t *out, uint32_t len, const uint8_t *t);
 *
 * AES-XTS decryption of len bytes from in to out.
 *
 *   x0 (KEY)   : key context; round keys start at offset 0, the round count is read from offset 240.
 *                The schedule is consumed with aesd/aesimc, so it is presumably the decryption
 *                key schedule - confirm against the key-setup code.
 *   x1 (IN)    : input buffer
 *   x2 (OUT)   : output buffer
 *   w3 (LEN)   : length in bytes. Only the low 32 bits are used, because the upper half of x3 is
 *                unspecified for a uint32_t argument under AAPCS64.
 *   x4 (TWEAK) : 16-byte tweak for the first block; the running tweak is stored back to this
 *                buffer before returning.
 *
 * When len % 16 != 0 the last whole block is held back from bulk processing: it is decrypted with
 * the tweak that follows the current one, then the stolen block is decrypted with the current
 * tweak (ciphertext stealing in decrypt order). Every route out of bulk processing, including the
 * five-block pipeline finishing with nothing left over, goes through .Lxts_dec_last_secondblk so
 * that this ordering is always applied.
 * NOTE(review): this assumes len >= 16 whenever len % 16 != 0 - confirm the caller enforces it.
 *
 * Returns 0 in x0. d8-d15 are saved and restored as required for callee-saved SIMD registers.
 */
.globl CRYPT_AES_XTS_Decrypt
.type CRYPT_AES_XTS_Decrypt, %function
.align    4
CRYPT_AES_XTS_Decrypt:
AARCH64_PACIASP
    stp x29, x30, [sp,#-80]!
    add x29, sp, #0
    stp d8, d9, [sp,#16]
    stp d10, d11, [sp,#32]
    stp d12, d13, [sp,#48]
    stp d14, d15, [sp,#64]

    ld1 {TWK0.16b}, [TWEAK]
    and LTMP, LEN, #0xfffffff0      // whole-block bytes; also drops the unspecified upper 32 bits of x3
    ands TAILNUM, LEN, #0xF   // get tail num, LEN % 16
    sub XTMP1,LTMP,#16      // preserve last and tail block
    csel LTMP,XTMP1,LTMP,ne  // if tailnum != 0, len -= 16

    mov WTMP0,0x87                  // constant consumed by NextTweak
    ldr ROUNDS,[KEY,#240]
    fmov TWX0,TWKD00                // keep a copy of the tweak in x13:x14
    fmov TWX1,TWKD01

    sub ROUNDS,ROUNDS,#6            // preload the last 7 round keys into RDK2..RDK8
    add KTMP,KEY,XROUNDS,lsl#4
    ld1 {RDK2.4s,RDK3.4s},[KTMP],#32
    ld1 {RDK4.4s,RDK5.4s},[KTMP],#32
    ld1 {RDK6.4s,RDK7.4s},[KTMP],#32
    ld1 {RDK8.4s},[KTMP]

    // Dispatch on the number of bulk bytes left.
.Lxts_aesdec_start:
    cmp LTMP, #80
    b.gt .Lxts_dec_proc_5_blks
    cmp LTMP, #48
    b.ge .Lxts_dec_proc_3_blks
    cmp LTMP, #32
    b.eq .Lxts_dec_proc_2_blks
    cmp LTMP, #16
    b.eq .Lxts_dec_proc_1blk
    cmp LTMP, #0
    b.eq .Lxts_dec_last_secondblk
    // Reached with LTMP < 0, i.e. after the held-back block has been decrypted
    // (and once more after the stolen block, when TAILNUM is already 0).
.Lxtx_dec_tail_blk:
    fmov TWX0,TWKD00  // reset already computed tweak
    fmov TWX1,TWKD01
    cbz TAILNUM,.Lxts_aesdec_finish
    // prepare decrypt tail block
    sub TMPOUT,OUT,#16              // output of the held-back block
.Lxtx_dec_tail_blk_loop:
    // For each tail byte i: out[i] = previous output[i], and the previous
    // block position receives in[i]. Both loads precede both stores.
    subs TAILNUM,TAILNUM,1
    ldrb WC,[TMPOUT,TAILNUM]
    ldrb WP,[IN,TAILNUM]
    strb WC,[OUT,TAILNUM]
    strb WP,[TMPOUT,TAILNUM]
    b.gt .Lxtx_dec_tail_blk_loop
    ld1 {BLK0.16b}, [TMPOUT]        // merged block to decrypt
    mov OUT,TMPOUT                  // result overwrites the previous block position
    mov TWK0.16b,TWK1.16b  // load pre-tweak back (saved in .Lxts_dec_last_secondblk)
    b .Lxts_dec_proc_1blk_loaded

    // All bulk blocks are done. Without a tail this is the end; with a tail
    // the held-back block is decrypted with the following tweak first.
.Lxts_dec_last_secondblk:
    cbz TAILNUM,.Lxts_aesdec_finish
    mov TWK1.16b,TWK0.16b   // save last second tweak
    NextTweak TWX0,TWX1,TWKD00,TWKD01
.Lxts_dec_proc_1blk:
    ld1 {BLK0.16b}, [IN],#16
.Lxts_dec_proc_1blk_loaded:
    mov KTMP, KEY
    eor BLK0.16b,BLK0.16b,TWK0.16b
    ld1 {RDK0.4s},[KTMP],#16
    sub TROUNDS,ROUNDS,#2
    ld1 {RDK1.4s},[KTMP],#16
.Lxts_dec_rounds_1blks:
    AesDecrypt1x BLK0,RDK0
    ld1 {RDK0.4s},[KTMP],#16
    subs TROUNDS,TROUNDS,#2

    AesDecrypt1x BLK0,RDK1
    ld1 {RDK1.4s},[KTMP],#16
    b.gt .Lxts_dec_rounds_1blks

    AesDecrypt1x BLK0,RDK0
    AesDecrypt1x BLK0,RDK1

    // last 7 round keys
    AesDecrypt1x BLK0,RDK2
    AesDecrypt1x BLK0,RDK3
    AesDecrypt1x BLK0,RDK4
    AesDecrypt1x BLK0,RDK5
    AesDecrypt1x BLK0,RDK6

    aesd BLK0.16b,RDK7.16b  // final round
    eor BLK0.16b,BLK0.16b,RDK8.16b
    eor BLK0.16b,BLK0.16b,TWK0.16b

    st1 {BLK0.16b}, [OUT], #16

    NextTweak TWX0,TWX1,TWKD00,TWKD01

    subs LTMP,LTMP,#16
    b.lt .Lxtx_dec_tail_blk         // LTMP went negative: held-back or stolen block just done
    b.hs .Lxts_aesdec_start

.Lxts_dec_proc_2_blks:
    ld1 {BLK0.16b, BLK1.16b}, [IN], #32
    mov KTMP, KEY
    NextTweak TWX0,TWX1,TWKD10,TWKD11
    ld1 {RDK0.4s,RDK1.4s},[KTMP],#32
    sub TROUNDS,ROUNDS,#2
    eor BLK0.16b, BLK0.16b, TWK0.16b
    eor BLK1.16b, BLK1.16b, TWK1.16b
.Lxts_dec_rounds_2blks:
    AesDecrypt1x BLK0,RDK0
    AesDecrypt1x BLK1,RDK0
    ld1 {RDK0.4s},[KTMP],#16
    subs TROUNDS,TROUNDS,#2

    AesDecrypt1x BLK0,RDK1
    AesDecrypt1x BLK1,RDK1
    ld1 {RDK1.4s},[KTMP],#16
    b.gt .Lxts_dec_rounds_2blks

    AesDecrypt1x BLK0,RDK0
    AesDecrypt1x BLK1,RDK0

    AesDecrypt1x BLK0,RDK1
    AesDecrypt1x BLK1,RDK1

    // last 7 round keys
    AesDecrypt1x BLK0,RDK2
    AesDecrypt1x BLK1,RDK2

    AesDecrypt1x BLK0,RDK3
    AesDecrypt1x BLK1,RDK3

    AesDecrypt1x BLK0,RDK4
    AesDecrypt1x BLK1,RDK4

    AesDecrypt1x BLK0,RDK5
    AesDecrypt1x BLK1,RDK5

    AesDecrypt1x BLK0,RDK6
    AesDecrypt1x BLK1,RDK6

    // Fold the last round key into the tweaks so one eor finishes each block.
    // TWK0 is rebuilt from TWX0/TWX1 by the NextTweak below.
    eor TWK0.16b,TWK0.16b,RDK8.16b
    eor TWK1.16b,TWK1.16b,RDK8.16b

    aesd BLK0.16b,RDK7.16b  // final round
    aesd BLK1.16b,RDK7.16b

    eor BLK0.16b,BLK0.16b,TWK0.16b
    eor BLK1.16b,BLK1.16b,TWK1.16b

    st1 {BLK0.16b, BLK1.16b}, [OUT], #32
    NextTweak TWX0,TWX1,TWKD00,TWKD01
    subs LTMP,LTMP,#32
    b.hs .Lxts_aesdec_start

.Lxts_dec_proc_3_blks:
    ld1 {BLK0.16b}, [IN], #16   // first block
    NextTweak TWX0,TWX1,TWKD10,TWKD11
    eor BLK0.16b,BLK0.16b,TWK0.16b

    ld1 {BLK1.16b}, [IN], #16   // second block
    NextTweak TWX0,TWX1,TWKD20,TWKD21
    eor BLK1.16b,BLK1.16b,TWK1.16b

    ld1 {BLK2.16b}, [IN], #16   // third block
    eor BLK2.16b,BLK2.16b,TWK2.16b

    mov KTMP, KEY
    ld1 {RDK0.4s,RDK1.4s},[KTMP],#32
    sub TROUNDS,ROUNDS,#2

.Lxts_dec_rounds_3blks:
    AesDecrypt1x BLK0,RDK0
    AesDecrypt1x BLK1,RDK0
    AesDecrypt1x BLK2,RDK0
    ld1 {RDK0.4s},[KTMP],#16
    subs TROUNDS,TROUNDS,#2

    AesDecrypt1x BLK0,RDK1
    AesDecrypt1x BLK1,RDK1
    AesDecrypt1x BLK2,RDK1
    ld1 {RDK1.4s},[KTMP],#16
    b.gt .Lxts_dec_rounds_3blks

    AesDecrypt1x BLK0,RDK0
    AesDecrypt1x BLK1,RDK0
    AesDecrypt1x BLK2,RDK0

    AesDecrypt1x BLK0,RDK1
    AesDecrypt1x BLK1,RDK1
    AesDecrypt1x BLK2,RDK1

    // last 7 round keys
    AesDecrypt1x BLK0,RDK2
    AesDecrypt1x BLK1,RDK2
    AesDecrypt1x BLK2,RDK2

    AesDecrypt1x BLK0,RDK3
    AesDecrypt1x BLK1,RDK3
    AesDecrypt1x BLK2,RDK3

    AesDecrypt1x BLK0,RDK4
    AesDecrypt1x BLK1,RDK4
    AesDecrypt1x BLK2,RDK4

    AesDecrypt1x BLK0,RDK5
    AesDecrypt1x BLK1,RDK5
    AesDecrypt1x BLK2,RDK5

    AesDecrypt1x BLK0,RDK6
    AesDecrypt1x BLK1,RDK6
    AesDecrypt1x BLK2,RDK6

    eor TWK0.16b,TWK0.16b,RDK8.16b
    eor TWK1.16b,TWK1.16b,RDK8.16b
    eor TWK2.16b,TWK2.16b,RDK8.16b

    aesd BLK0.16b,RDK7.16b
    aesd BLK1.16b,RDK7.16b
    aesd BLK2.16b,RDK7.16b

    eor BLK0.16b,BLK0.16b,TWK0.16b
    eor BLK1.16b,BLK1.16b,TWK1.16b
    eor BLK2.16b,BLK2.16b,TWK2.16b

    st1 {BLK0.16b, BLK1.16b, BLK2.16b}, [OUT], #48

    NextTweak TWX0,TWX1,TWKD00,TWKD01

    subs LTMP,LTMP,#48
    b.hs .Lxts_aesdec_start

.align 4
.Lxts_dec_proc_5_blks:
    ld1 {BLK0.16b}, [IN], #16   // first block
    NextTweak TWX0,TWX1,TWKD10,TWKD11
    eor BLK0.16b,BLK0.16b,TWK0.16b

    ld1 {BLK1.16b}, [IN], #16   // second block
    NextTweak TWX0,TWX1,TWKD20,TWKD21
    eor BLK1.16b,BLK1.16b,TWK1.16b
    sub LTMP,LTMP,#32

    ld1 {BLK2.16b}, [IN], #16   // third block
    NextTweak TWX0,TWX1,TWKD30,TWKD31
    eor BLK2.16b,BLK2.16b,TWK2.16b

    ld1 {BLK3.16b}, [IN], #16   // fourth block
    NextTweak TWX0,TWX1,TWKD40,TWKD41
    eor BLK3.16b,BLK3.16b,TWK3.16b
    sub LTMP,LTMP,#32

    ld1 {BLK4.16b}, [IN], #16   // fifth block
    eor BLK4.16b, BLK4.16b, TWK4.16b
    sub LTMP,LTMP,#16           // LTMP = bulk bytes left after this group

    mov KTMP, KEY
    ld1 {RDK0.4s,RDK1.4s},[KTMP],#32
    sub TROUNDS,ROUNDS,#2
.align 4
.Lxts_dec_rounds_5blks:
    AesDecrypt1x BLK0,RDK0
    AesDecrypt1x BLK1,RDK0
    AesDecrypt1x BLK2,RDK0
    AesDecrypt1x BLK3,RDK0
    AesDecrypt1x BLK4,RDK0
    ld1 {RDK0.4s},[KTMP],#16
    subs TROUNDS,TROUNDS,#2

    AesDecrypt1x BLK0,RDK1
    AesDecrypt1x BLK1,RDK1
    AesDecrypt1x BLK2,RDK1
    AesDecrypt1x BLK3,RDK1
    AesDecrypt1x BLK4,RDK1
    ld1 {RDK1.4s},[KTMP],#16
    b.gt .Lxts_dec_rounds_5blks

    AesDecrypt1x BLK0,RDK0
    AesDecrypt1x BLK1,RDK0
    AesDecrypt1x BLK2,RDK0
    AesDecrypt1x BLK3,RDK0
    AesDecrypt1x BLK4,RDK0
    // Account for the next group. These flags stay live until the b.hs at the
    // end of the iteration; nothing in between may modify the condition flags.
    subs LTMP,LTMP,#80

    AesDecrypt1x BLK0,RDK1
    AesDecrypt1x BLK1,RDK1
    AesDecrypt1x BLK2,RDK1
    AesDecrypt1x BLK3,RDK1
    AesDecrypt1x BLK4,RDK1

    // last 7 round keys
    AesDecrypt1x BLK0,RDK2
    AesDecrypt1x BLK1,RDK2
    AesDecrypt1x BLK2,RDK2
    AesDecrypt1x BLK3,RDK2
    AesDecrypt1x BLK4,RDK2
    csel POS,xzr,LTMP,gt  // POS = 0 while more than 80 bytes follow, else LTMP (<= 0)

    AesDecrypt1x BLK0,RDK3
    AesDecrypt1x BLK1,RDK3
    AesDecrypt1x BLK2,RDK3
    AesDecrypt1x BLK3,RDK3
    AesDecrypt1x BLK4,RDK3
    add IN,IN,POS         // step back so the 80-byte prefetch ends at the last bulk block

    AesDecrypt1x BLK0,RDK4
    AesDecrypt1x BLK1,RDK4
    AesDecrypt1x BLK2,RDK4
    AesDecrypt1x BLK3,RDK4
    AesDecrypt1x BLK4,RDK4

    AesDecrypt1x BLK0,RDK5
    AesDecrypt1x BLK1,RDK5
    AesDecrypt1x BLK2,RDK5
    AesDecrypt1x BLK3,RDK5
    AesDecrypt1x BLK4,RDK5

    AesDecrypt1x BLK0,RDK6
    AesDecrypt1x BLK1,RDK6
    AesDecrypt1x BLK2,RDK6
    AesDecrypt1x BLK3,RDK6
    AesDecrypt1x BLK4,RDK6

    eor TMP0.16b,TWK0.16b,RDK8.16b
    aesd BLK0.16b,RDK7.16b  // final round
    NextTweak TWX0,TWX1,TWKD00,TWKD01  // perform operations of next 5blks in advance

    eor TMP1.16b,TWK1.16b,RDK8.16b
    ld1 {IN0.16b}, [IN], #16
    aesd BLK1.16b,RDK7.16b
    NextTweak TWX0,TWX1,TWKD10,TWKD11

    eor TMP2.16b,TWK2.16b,RDK8.16b
    ld1 {IN1.16b}, [IN], #16
    aesd BLK2.16b,RDK7.16b
    NextTweak TWX0,TWX1,TWKD20,TWKD21

    eor TMP3.16b,TWK3.16b,RDK8.16b
    ld1 {IN2.16b}, [IN], #16
    aesd BLK3.16b,RDK7.16b
    NextTweak TWX0,TWX1,TWKD30,TWKD31

    eor TMP4.16b,TWK4.16b,RDK8.16b
    ld1 {IN3.16b}, [IN], #16
    aesd BLK4.16b,RDK7.16b
    NextTweak TWX0,TWX1,TWKD40,TWKD41

    ld1 {IN4.16b}, [IN], #16
    mov KTMP, KEY
    ld1 {RDK0.4s,RDK1.4s},[KTMP],#32
    eor TMP0.16b,TMP0.16b,BLK0.16b
    eor BLK0.16b,IN0.16b,TWK0.16b  // blk0 = in0 ^ twk0
    eor TMP1.16b,TMP1.16b,BLK1.16b
    eor BLK1.16b,IN1.16b,TWK1.16b
    st1 {TMP0.16b}, [OUT], #16
    eor TMP2.16b,TMP2.16b,BLK2.16b
    eor BLK2.16b,IN2.16b,TWK2.16b
    eor TMP3.16b,TMP3.16b,BLK3.16b
    eor BLK3.16b,IN3.16b,TWK3.16b
    st1 {TMP1.16b}, [OUT], #16
    eor TMP4.16b,TMP4.16b,BLK4.16b
    eor BLK4.16b,IN4.16b,TWK4.16b
    st1 {TMP2.16b}, [OUT], #16
    sub TROUNDS,ROUNDS,#2
    st1 {TMP3.16b,TMP4.16b}, [OUT], #32

    b.hs .Lxts_dec_rounds_5blks  // flags from the subs above: another full group is loaded
    add LTMP,LTMP,#80      // add 5 blocks length back if LTMP < 0
    // Nothing left in the bulk area. This must not jump straight to the tail
    // code: with a partial tail the held-back block still has to be decrypted
    // with the following tweak first.
    cbz LTMP,.Lxts_dec_5blks_drained
    // 16..64 bytes remain. IN0..IN4 hold the last five bulk input blocks, so
    // the remaining blocks are the final LTMP/16 of them. Each BLKn currently
    // equals INn ^ TWKn; the fix-ups below swap in the right input block.
    cmp LTMP, #16
    b.eq .Lxts_dec_pre_last_1blks
    cmp LTMP,#32
    b.eq .Lxts_dec_pre_last_2blks
    cmp LTMP,#48
    b.eq .Lxts_dec_pre_last_3blks
    cmp LTMP,#64
    b.eq .Lxts_dec_pre_last_4blks
.Lxts_dec_pre_last_1blks:
    eor IN0.16b,IN0.16b,IN4.16b   // in0 = in0 ^ in4
    eor BLK0.16b,BLK0.16b,IN0.16b   // blk0 = in0 ^ twk0 ^ in0 ^ in4 = in4 ^ twk0
    fmov TWX0,TWKD00  // reset already computed tweak
    fmov TWX1,TWKD01
    b .Lxts_dec_rounds_1blks
.Lxts_dec_pre_last_2blks:
    eor BLK0.16b,BLK0.16b,IN0.16b
    eor BLK1.16b,BLK1.16b,IN1.16b
    eor BLK0.16b,BLK0.16b,IN3.16b  // in3 -> blk0
    eor BLK1.16b,BLK1.16b,IN4.16b  // in4 -> blk1
    fmov TWX0,TWKD10  // reset already computed tweak
    fmov TWX1,TWKD11
    b .Lxts_dec_rounds_2blks
.Lxts_dec_pre_last_3blks:
    eor BLK0.16b,BLK0.16b,IN0.16b
    eor BLK1.16b,BLK1.16b,IN1.16b
    eor BLK2.16b,BLK2.16b,IN2.16b
    eor BLK0.16b,BLK0.16b,IN2.16b  // in2 -> blk0
    eor BLK1.16b,BLK1.16b,IN3.16b  // in3 -> blk1
    eor BLK2.16b,BLK2.16b,IN4.16b  // in4 -> blk2
    fmov TWX0,TWKD20  // reset already computed tweak
    fmov TWX1,TWKD21
    b .Lxts_dec_rounds_3blks
.Lxts_dec_pre_last_4blks:
    eor BLK0.16b,BLK0.16b,IN0.16b
    eor BLK1.16b,BLK1.16b,IN1.16b
    eor BLK2.16b,BLK2.16b,IN2.16b
    eor BLK3.16b,BLK3.16b,IN3.16b
    sub IN,IN,#16 // four blocks were loaded but only three are processed here, so step back one block
    eor BLK0.16b,BLK0.16b,IN1.16b  // in1 -> blk0
    eor BLK1.16b,BLK1.16b,IN2.16b  // in2 -> blk1
    eor BLK2.16b,BLK2.16b,IN3.16b  // in3 -> blk2
    eor BLK3.16b,BLK3.16b,IN4.16b  // in4 -> blk3
    fmov TWX0,TWKD20  // reset already computed tweak
    fmov TWX1,TWKD21
    b .Lxts_dec_rounds_3blks

    // The five-block pipeline consumed the bulk area exactly. The pipeline ran
    // TWX0/TWX1 four tweaks ahead, so restore them from TWK0 (the tweak of the
    // next unprocessed block) and take the common end-of-bulk path, which
    // finishes when there is no tail and otherwise handles the held-back block.
.Lxts_dec_5blks_drained:
    fmov TWX0,TWKD00
    fmov TWX1,TWKD01
    b .Lxts_dec_last_secondblk

.Lxts_aesdec_finish:
    // Write the running tweak back to the caller buffer.
    MOV_REG_TO_VEC(TWX0,TWX1,TWKD00,TWKD01)
    st1 {TWK0.16b}, [TWEAK]

    mov x0, #0  // return 0

    ldp d14, d15, [sp,#64]
    ldp d12, d13, [sp, #48]
    ldp d10, d11, [sp, #32]
    ldp d8, d9, [sp, #16]
    ldp x29, x30, [sp], #80
AARCH64_AUTIASP
    ret
.size CRYPT_AES_XTS_Decrypt, .-CRYPT_AES_XTS_Decrypt
995
996#endif
997