• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; AesOpt.asm -- AES optimized code for x86 AES hardware instructions
2; 2021-12-25 : Igor Pavlov : Public domain
3
4include 7zAsm.asm
5
; Assembly-time feature selection.
; use_vaes_256 is defined when the assembler defines __ASMC__, or when the
; symbol ymm0 is known to the assembler. The *_HW_256 procedures later in this
; file test use_vaes_256 to decide whether to emit the 256-bit code.
; NOTE(review): presumably "ifdef ymm0" is true only on assemblers that know
; the ymm registers - confirm for each supported assembler.
ifdef __ASMC__
  use_vaes_256 equ 1
else
ifdef ymm0
  use_vaes_256 equ 1
endif
endif


; Print the selected configuration while assembling (diagnostics only).
ifdef use_vaes_256
  ECHO "++ VAES 256"
else
  ECHO "-- NO VAES 256"
endif

; Target word size, and for 32-bit builds the calling convention chosen by
; IS_CDECL (defined outside this file).
ifdef x64
  ECHO "x86-64"
else
  ECHO "x86"
if (IS_CDECL gt 0)
  ECHO "ABI : CDECL"
else
  ECHO "ABI : no CDECL : FASTCALL"
endif
endif

; Platform ABI chosen by IS_LINUX (defined outside this file).
if (IS_LINUX gt 0)
  ECHO "ABI : LINUX"
else
  ECHO "ABI : WINDOWS"
endif
37
; MY_ASM_START is not defined in this file.
; NOTE(review): presumably it comes from the included 7zAsm.asm and opens the
; code segment - confirm there.
MY_ASM_START

; 32-bit builds: enable the .686 instruction set and the xmm registers.
ifndef x64
    .686
    .xmm
endif


; Segment alignment is currently disabled: MY_ALIGN is an empty text equate.
; The line below is the alternative that was switched off.
; MY_ALIGN EQU ALIGN(64)
MY_ALIGN EQU

SEG_ALIGN EQU MY_ALIGN

; MY_SEG_PROC name, numParams
;   Opens a procedure. At present it only forwards to MY_PROC; the code that
;   would place each procedure in its own segment is commented out.
MY_SEG_PROC macro name:req, numParams:req
    ; seg_name equ @CatStr(_TEXT$, name)
    ; seg_name SEGMENT SEG_ALIGN 'CODE'
    MY_PROC name, numParams
endm

; MY_SEG_ENDP
;   Counterpart of MY_SEG_PROC. It currently emits nothing because the
;   per-procedure segment is disabled.
MY_SEG_ENDP macro
    ; seg_name ENDS
endm
60
61
; Upper bound on the number of round keys. It sizes the stack scratch area
; (AVX_STACK_SUB) used by the *_HW_256 procedures for broadcast round keys.
NUM_AES_KEYS_MAX equ 15

; the number of push operators in function PROLOG
; These two values are defined for every target except Linux x64, which is
; the one target where MY_PROLOG pushes nothing.
; stack_param_offset is the distance from r4 (after the pushes) to the first
; stack slot above the return address: one slot for the return address plus
; one slot per pushed register.
if (IS_LINUX eq 0) or (IS_X64 eq 0)
num_regs_push   equ 2
stack_param_offset equ (REG_SIZE * (1 + num_regs_push))
endif

; num_param: operand that holds the third argument (the count loaded into rN
; by MY_PROLOG).
;   x64        : the register REG_ABI_PARAM_2
;   x86 cdecl  : third stack argument, all three arguments are on the stack
;   x86 other  : first stack argument
; NOTE(review): the non-cdecl x86 case presumably has the first two arguments
; in registers (fastcall) - confirm against 7zAsm.asm.
ifdef x64
    num_param   equ REG_ABI_PARAM_2
else
  if (IS_CDECL gt 0)
    ;   size_t     size
    ;   void *     data
    ;   UInt32 *   aes
    ;   ret-ip <- (r4)
    aes_OFFS    equ (stack_param_offset)
    data_OFFS   equ (REG_SIZE + aes_OFFS)
    size_OFFS   equ (REG_SIZE + data_OFFS)
    num_param   equ [r4 + size_OFFS]
  else
    num_param   equ [r4 + stack_param_offset]
  endif
endif
86
; ---------- register roles shared by all procedures in this file ----------
; The rN / xN names and REG_PARAM_n are not defined here.
; NOTE(review): presumably they are defined in 7zAsm.asm - confirm the
; physical registers there.

; keys : pointer into the AES state block. On entry it points at the IV; the
;        procedures advance it by 32 to reach the round keys.
; rD   : pointer to the data processed in place, 16 bytes per block.
; rN   : number of 16-byte blocks still to process.
keys    equ  REG_PARAM_0  ; r1
rD      equ  REG_PARAM_1  ; r2
rN      equ  r0

; koffs: running byte offset into the round keys inside the round loops.
koffs_x equ  x7
koffs_r equ  r7

; ksize: byte span of the round keys, derived from the dword at [keys + 16].
ksize_x equ  x6
ksize_r equ  r6

; keys2: base of the on-stack table of broadcast round keys. It is used only
;        by the *_HW_256 procedures, which push and pop it around that use.
keys2   equ  r3

; xmm0 / ymm0 has two roles: cipher state of the single-block loops, and
; scratch register for the current round key in the multi-lane loops.
state   equ  xmm0
key     equ  xmm0
key_ymm equ  ymm0
; register number of key_ymm, used by the hand-encoded VAES instructions
key_ymm_n equ   0

; ways: number of vector registers (lanes) processed per iteration of the
; unrolled loops.
ifdef x64
        ways = 11
else
        ways = 4
endif

; Lane registers are numbered ways_start_reg .. ways_start_reg + ways - 1.
ways_start_reg equ 1

; The register right after the lanes holds the IV / counter.
iv      equ     @CatStr(xmm, %(ways_start_reg + ways))
iv_ymm  equ     @CatStr(ymm, %(ways_start_reg + ways))
114
115
; WOP op, op2
;   Emits "op xmmK, op2" once per lane, for K = ways_start_reg up to
;   ways_start_reg + ways - 1.
;   op  : an instruction mnemonic or the name of a two-argument macro.
;   op2 : second operand text; may be empty. The text may refer to the
;         assembly-time counter i, which holds the lane index (0-based) while
;         each line is expanded.
;   On exit i equals ways.
WOP macro op, op2
    i = 0
    while i lt ways
        op      @CatStr(xmm, %(ways_start_reg + i)), op2
        i = i + 1
    endm
endm
123
124
125ifndef ABI_LINUX
126ifdef x64
127
; we use 32 bytes of home space in stack in WIN64-x64
NUM_HOME_MM_REGS   equ (32 / 16)
; we preserve xmm registers starting from xmm6 in WIN64-x64
MM_START_SAVE_REG  equ 6

; SAVE_XMM num_used_mm_regs
;   Saves xmm6 .. xmm(num_used_mm_regs - 1). Emits nothing when
;   num_used_mm_regs is 6 or less.
;   The first NUM_HOME_MM_REGS registers go to [r4 + stack_param_offset ...],
;   the home space mentioned above. For the remaining registers the macro
;   first lowers r4 by stack_offset and stores them from [r4] upward.
;   Side effects at assembly time: sets num_save_mm_regs, and when that is
;   positive also num_save_mm_regs2 and stack_offset. RESTORE_XMM reads these
;   symbols, so it always undoes the most recently expanded SAVE_XMM.
;   NOTE(review): the "stack_param_offset mod 16" term presumably keeps the
;   movdqa slots 16-byte aligned - confirm with REG_SIZE from 7zAsm.asm.
SAVE_XMM macro num_used_mm_regs:req
  num_save_mm_regs = num_used_mm_regs - MM_START_SAVE_REG
  if num_save_mm_regs GT 0
    num_save_mm_regs2 = num_save_mm_regs - NUM_HOME_MM_REGS
    ; RSP is (16*x + 8) after entering the function in WIN64-x64
    stack_offset = 16 * num_save_mm_regs2 + (stack_param_offset mod 16)

    i = 0
    rept num_save_mm_regs

      ; reserve the extra stack area just before the first register that
      ; does not fit into the home space
      if i eq NUM_HOME_MM_REGS
        sub  r4, stack_offset
      endif

      if i lt NUM_HOME_MM_REGS
        movdqa  [r4 + stack_param_offset + i * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i))
      else
        movdqa  [r4 + (i - NUM_HOME_MM_REGS) * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i))
      endif

      i = i + 1
    endm
  endif
endm
157
; RESTORE_XMM num_used_mm_regs
;   Reloads the xmm registers stored by SAVE_XMM and releases the stack area
;   it reserved.
;   The parameter is required by the signature but is not referenced in the
;   body. The macro works from the assembly-time symbols num_save_mm_regs,
;   num_save_mm_regs2 and stack_offset left behind by the latest SAVE_XMM.
;   Order: first the registers kept in the reserved area (from [r4] upward),
;   then r4 is raised by stack_offset, then the registers kept at
;   [r4 + stack_param_offset ...].
RESTORE_XMM macro num_used_mm_regs:req
  if num_save_mm_regs GT 0
    i = 0
    if num_save_mm_regs2 GT 0
      rept num_save_mm_regs2
        movdqa  @CatStr(xmm, %(MM_START_SAVE_REG + NUM_HOME_MM_REGS + i)), [r4 + i * 16]
        i = i + 1
      endm
        add     r4, stack_offset
    endif

    ; i is num_save_mm_regs2 if the loop above ran, otherwise it is still 0,
    ; so num_low_regs counts the registers that were stored above the
    ; return address
    num_low_regs = num_save_mm_regs - i
    i = 0
      rept num_low_regs
        movdqa  @CatStr(xmm, %(MM_START_SAVE_REG + i)), [r4 + stack_param_offset + i * 16]
        i = i + 1
      endm
  endif
endm
177
178endif ; x64
179endif ; ABI_LINUX
180
181
; MY_PROLOG num_used_mm_regs
;   Common entry sequence of every procedure in this file.
;   num_used_mm_regs : one more than the highest xmm register number the
;                      procedure touches; forwarded to SAVE_XMM.
;   Steps, in this order:
;     1. push r6 and r7 (all targets except Linux x64)
;     2. load the block count into rN
;     3. x86 cdecl: load rD and keys from the stack;
;        Linux x64: run MY_ABI_LINUX_TO_WIN_2 (defined outside this file)
;     4. x64 without ABI_LINUX: save xmm registers with SAVE_XMM
;     5. ksize_x = dword at [keys + 16], multiplied by 32
;   On exit: keys, rD, rN and ksize_x are set; keys still points at the IV.
;   NOTE(review): MY_ABI_LINUX_TO_WIN_2 presumably moves the first two
;   arguments into REG_PARAM_0 and REG_PARAM_1 - confirm in 7zAsm.asm.
MY_PROLOG macro num_used_mm_regs:req
        ; num_regs_push: must be equal to the number of push operators
        ; push    r3
        ; push    r5
    if (IS_LINUX eq 0) or (IS_X64 eq 0)
        push    r6
        push    r7
    endif

        mov     rN, num_param  ; don't move it; num_param can use stack pointer (r4)

    if (IS_X64 eq 0)
      if (IS_CDECL gt 0)
        mov     rD,   [r4 + data_OFFS]
        mov     keys, [r4 + aes_OFFS]
      endif
    elseif (IS_LINUX gt 0)
        MY_ABI_LINUX_TO_WIN_2
    endif


    ; SAVE_XMM may lower r4, so every r4-relative argument load is done
    ; before this point
    ifndef ABI_LINUX
    ifdef x64
        SAVE_XMM num_used_mm_regs
    endif
    endif

        ; ksize_x = (dword at [keys + 16]) * 32
        mov     ksize_x, [keys + 16]
        shl     ksize_x, 5
endm
212
213
; MY_EPILOG
;   Common exit sequence; mirrors MY_PROLOG in reverse order.
;     1. x64 without ABI_LINUX: reload xmm registers with RESTORE_XMM.
;        The argument passed here is num_save_mm_regs; RESTORE_XMM does not
;        reference its parameter, so the value has no effect.
;     2. pop r7 and r6 (all targets except Linux x64)
;     3. MY_ENDP (defined outside this file)
;   NOTE(review): MY_ENDP presumably emits the return and closes the
;   procedure - confirm in 7zAsm.asm.
MY_EPILOG macro
    ifndef ABI_LINUX
    ifdef x64
        RESTORE_XMM num_save_mm_regs
    endif
    endif

    if (IS_LINUX eq 0) or (IS_X64 eq 0)
        pop     r7
        pop     r6
    endif
        ; pop     r5
        ; pop     r3
    MY_ENDP
endm
229
230
; OP_KEY op, offs
;   Single-block form: applies op to state with the 16-byte memory operand
;   at [keys + offs].
OP_KEY macro op:req, offs:req
        op      state, [keys + offs]
endm


; WOP_KEY op, offs
;   Multi-lane form: loads the 16 bytes at [keys + offs] into key once, then
;   applies op with that register to every lane through WOP.
;   Overwrites key, which is the same register as state.
WOP_KEY macro op:req, offs:req
        movdqa  key, [keys + offs]
        WOP     op, key
endm
240
241
242; ---------- AES-CBC Decode ----------
243
244
; Per-lane helpers meant to be passed as op to WOP. They use the lane index i
; maintained by WOP; the second parameter is ignored.

; XOR_WITH_DATA reg : reg = reg xor the 16 bytes at [rD + i * 16]
XOR_WITH_DATA macro reg, _ppp_
        pxor    reg, [rD + i * 16]
endm

; WRITE_TO_DATA reg : stores reg to [rD + i * 16] with an aligned store
WRITE_TO_DATA macro reg, _ppp_
        movdqa  [rD + i * 16], reg
endm
252
253
254; state0    equ  @CatStr(xmm, %(ways_start_reg))
255
; Registers used by the CBC decode procedures, placed right after iv.
; key0     : round key applied first (xored into the input blocks)
; key_last : round key applied last (operand of aesdeclast)
; Note: key0 and key0_ymm are given a different register further down in the
; file, for the CTR procedures.
key0            equ  @CatStr(xmm, %(ways_start_reg + ways + 1))
key0_ymm        equ  @CatStr(ymm, %(ways_start_reg + ways + 1))

key_last        equ  @CatStr(xmm, %(ways_start_reg + ways + 2))
key_last_ymm    equ  @CatStr(ymm, %(ways_start_reg + ways + 2))
; register number of key_last_ymm, for the hand-encoded VAES instructions
key_last_ymm_n  equ                (ways_start_reg + ways + 2)

; one more than the highest xmm register number used by CBC decode
NUM_CBC_REGS    equ  (ways_start_reg + ways + 3)
264
265
; AesCbc_Decode_HW (3 parameters: AES state pointer, data pointer, block count)
;   CBC decryption in place with the AES-NI instructions.
;   Layout read from the state pointer:
;     +0  : IV, read on entry and written back on exit
;     +16 : dword that MY_PROLOG turns into ksize (value * 32)
;     +32 : round keys, 16 bytes each; [+32] is used last (aesdeclast) and
;           [+32 + ksize] is used first
;   Data is accessed with movdqa, so it is expected to be 16-byte aligned.
;   Two loops: a main loop handling "ways" blocks per iteration, then a loop
;   handling the remaining blocks one at a time.
;   The public labels *_start, *_start_2 and *_start_3 are entry points used
;   by AesCbc_Decode_HW_256.
MY_SEG_PROC AesCbc_Decode_HW, 3

    AesCbc_Decode_HW_start::
        MY_PROLOG NUM_CBC_REGS

    ; entry for callers that have already executed the prolog
    AesCbc_Decode_HW_start_2::
        movdqa  iv, [keys]
        add     keys, 32

        movdqa  key0, [keys + 1 * ksize_r]
        movdqa  key_last, [keys]
        ; ksize now is the offset of the first key used by aesdec
        sub     ksize_x, 16

        jmp     check2
    align 16
    nextBlocks2:
        ; load "ways" ciphertext blocks into the lane registers
        WOP     movdqa, [rD + i * 16]
        mov     koffs_x, ksize_x
        ; WOP_KEY pxor, ksize_r + 16
        WOP     pxor, key0
    ; align 16
    @@:
        ; middle rounds: keys at offsets ksize, ksize - 16, ..., 16
        WOP_KEY aesdec, 1 * koffs_r
        sub     koffs_r, 16
        jnz     @B
        ; WOP_KEY aesdeclast, 0
        WOP     aesdeclast, key_last

        ; CBC chaining: lane 0 is xored with iv, lane i with the ciphertext
        ; block before it, which is still unmodified in memory
        pxor    @CatStr(xmm, %(ways_start_reg)), iv
    i = 1
    rept ways - 1
        pxor    @CatStr(xmm, %(ways_start_reg + i)), [rD + i * 16 - 16]
        i = i + 1
    endm
        ; the last ciphertext block becomes the next iv; it must be read
        ; before the plaintext is written over it
        movdqa  iv, [rD + ways * 16 - 16]
        WOP     WRITE_TO_DATA

        add     rD, ways * 16
    ; entry used after the 256-bit loop has consumed its share of the data
    AesCbc_Decode_HW_start_3::
    check2:
        ; continue while at least "ways" blocks remain (no borrow)
        sub     rN, ways
        jnc     nextBlocks2
        add     rN, ways

        ; the single-block loop handles two keys per iteration and uses an
        ; offset that is 16 lower
        sub     ksize_x, 16

        jmp     check
    nextBlock:
        movdqa  state, [rD]
        mov     koffs_x, ksize_x
        ; OP_KEY  pxor, 1 * ksize_r + 32
        pxor    state, key0
        ; movdqa  state0, [rD]
        ; movdqa  state, key0
        ; pxor    state, state0
    @@:
        OP_KEY  aesdec, 1 * koffs_r + 16
        OP_KEY  aesdec, 1 * koffs_r
        sub     koffs_r, 32
        jnz     @B
        ; the key at offset 16 is not covered by the loop above
        OP_KEY  aesdec, 16
        ; OP_KEY  aesdeclast, 0
        aesdeclast state, key_last

        pxor    state, iv
        ; current ciphertext becomes the next iv before it is overwritten
        movdqa  iv, [rD]
        ; movdqa  iv, state0
        movdqa  [rD], state

        add     rD, 16
    check:
        sub     rN, 1
        jnc     nextBlock

        ; store the updated iv; keys was advanced by 32 above
        movdqa  [keys - 32], iv
MY_EPILOG
342
343
344
345
346; ---------- AVX ----------
347
348
; AVX__WOP_n op
;   Expands "op (register number)" once per lane. The argument is the
;   expression (ways_start_reg + i), not a register name; it is meant for the
;   macros that hand-encode VAES instructions.
;   i is the 0-based lane index during expansion and equals ways afterwards.
AVX__WOP_n macro op
    i = 0
    while i lt ways
        op      (ways_start_reg + i)
        i = i + 1
    endm
endm

; AVX__WOP op
;   Expands "op ymmK" once per lane, for K = ways_start_reg up to
;   ways_start_reg + ways - 1.
;   i is the 0-based lane index during expansion and equals ways afterwards.
AVX__WOP macro op
    i = 0
    while i lt ways
        op      @CatStr(ymm, %(ways_start_reg + i))
        i = i + 1
    endm
endm
364
365
; AVX__WOP_KEY op, offs
;   Loads 32 bytes from [keys2 + offs] into key_ymm with an aligned load,
;   then expands op for every lane register number through AVX__WOP_n.
;   Overwrites key_ymm.
AVX__WOP_KEY macro op:req, offs:req
        vmovdqa  key_ymm, ymmword ptr [keys2 + offs]
        AVX__WOP_n op
endm
370
371
; Per-lane helpers meant to be passed as op to AVX__WOP. They use the lane
; index i maintained by AVX__WOP; each lane covers 32 bytes of data.

; AVX__CBC_START reg : reg = key0_ymm xor the 32 bytes at [rD + 32 * i]
AVX__CBC_START macro reg
        ; vpxor   reg, key_ymm, ymmword ptr [rD + 32 * i]
        vpxor   reg, key0_ymm, ymmword ptr [rD + 32 * i]
endm

; AVX__CBC_END reg : CBC chaining xor.
;   Lane 0 is xored with iv_ymm. Every other lane is xored with the 32 bytes
;   that start 16 bytes before its own data, i.e. the preceding blocks.
AVX__CBC_END macro reg
    if i eq 0
        vpxor   reg, reg, iv_ymm
    else
        vpxor   reg, reg, ymmword ptr [rD + i * 32 - 16]
    endif
endm


; AVX__WRITE_TO_DATA reg : unaligned 32-byte store of reg to [rD + 32 * i]
AVX__WRITE_TO_DATA macro reg
        vmovdqu ymmword ptr [rD + 32 * i], reg
endm

; AVX__XOR_WITH_DATA reg : reg = reg xor the 32 bytes at [rD + 32 * i]
AVX__XOR_WITH_DATA macro reg
        vpxor   reg, reg, ymmword ptr [rD + 32 * i]
endm

; AVX__CTR_START reg : advances the counter pair in iv_ymm by one_ymm, then
;   sets reg = iv_ymm xor key0_ymm. Modifies iv_ymm.
AVX__CTR_START macro reg
        vpaddq  iv_ymm, iv_ymm, one_ymm
        ; vpxor   reg, iv_ymm, key_ymm
        vpxor   reg, iv_ymm, key0_ymm
endm
399
400
; MY_VAES_INSTR_2 cmd, dest, a1, a2
;   Emits one 5-byte instruction as raw bytes, so the assembler itself does
;   not need to know the VAES mnemonics.
;   cmd  : opcode byte
;   dest : destination register number (0..15)
;   a1   : first source register number (0..15)
;   a2   : second source register number (0..15)
;   Bytes emitted:
;     0C4h
;     2 + 40h, plus 20h when a2 is below 8, plus 80h when dest is below 8
;     5 + 8 * (a1 inverted, low 4 bits)
;     cmd
;     0C0h + 8 * (dest mod 8) + (a2 mod 8)
;   NOTE(review): this matches the 3-byte VEX layout (opcode map 2, 66h
;   prefix, 256-bit length, register-register ModRM) - confirm against the
;   Intel SDM.
MY_VAES_INSTR_2 macro cmd, dest, a1, a2
  db 0c4H
  db 2 + 040H + 020h * (1 - (a2) / 8) + 080h * (1 - (dest) / 8)
  db 5 + 8 * ((not (a1)) and 15)
  db cmd
  db 0c0H + 8 * ((dest) and 7) + ((a2) and 7)
endm

; MY_VAES_INSTR cmd, dest, a
;   Two-operand form: dest is both the destination and the first source.
MY_VAES_INSTR macro cmd, dest, a
        MY_VAES_INSTR_2  cmd, dest, dest, a
endm

; One wrapper per operation; dest and a are register numbers.
MY_vaesenc macro dest, a
        MY_VAES_INSTR  0dcH, dest, a
endm
MY_vaesenclast macro dest, a
        MY_VAES_INSTR  0ddH, dest, a
endm
MY_vaesdec macro dest, a
        MY_VAES_INSTR  0deH, dest, a
endm
MY_vaesdeclast macro dest, a
        MY_VAES_INSTR  0dfH, dest, a
endm
425
426
; Per-lane round helpers meant to be passed as op to AVX__WOP_n or
; AVX__WOP_KEY; reg is a register number, not a register name.

; one decryption round with the key currently held in key_ymm
AVX__VAES_DEC macro reg
        MY_vaesdec reg, key_ymm_n
endm

; final decryption round with the key held in key_last_ymm
AVX__VAES_DEC_LAST_key_last macro reg
        ; MY_vaesdeclast reg, key_ymm_n
        MY_vaesdeclast reg, key_last_ymm_n
endm

; one encryption round with the key currently held in key_ymm
AVX__VAES_ENC macro reg
        MY_vaesenc reg, key_ymm_n
endm

; final encryption round with the key currently held in key_ymm
AVX__VAES_ENC_LAST macro reg
        MY_vaesenclast reg, key_ymm_n
endm

; AVX__vinserti128_TO_HIGH dest, src
;   Replaces the upper 128 bits of dest with src; the lower 128 bits stay.
AVX__vinserti128_TO_HIGH macro dest, src
        vinserti128  dest, dest, src, 1
endm
447
448
; AesCbc_Decode_HW_256 (3 parameters, same as AesCbc_Decode_HW)
;   CBC decryption with 256-bit registers: each lane holds two blocks, so one
;   iteration handles ways * 2 blocks.
;   Without use_vaes_256 the procedure is only a jump to AesCbc_Decode_HW.
;   With use_vaes_256:
;     - fewer than ways * 2 blocks: continue in AesCbc_Decode_HW after its
;       prolog (the prolog has already run here)
;     - otherwise: run the wide loop, then jump into AesCbc_Decode_HW at
;       AesCbc_Decode_HW_start_3 to finish the remaining blocks and return
;   The round keys are broadcast to both 128-bit halves and kept in a 32-byte
;   aligned table on the stack, addressed through keys2.
MY_PROC AesCbc_Decode_HW_256, 3
  ifdef use_vaes_256
        MY_PROLOG NUM_CBC_REGS

        cmp    rN, ways * 2
        jb     AesCbc_Decode_HW_start_2

        vmovdqa iv, xmmword ptr [keys]
        add     keys, 32

        vbroadcasti128  key0_ymm, xmmword ptr [keys + 1 * ksize_r]
        vbroadcasti128  key_last_ymm, xmmword ptr [keys]
        sub     ksize_x, 16
        ; koffs walks the 16-byte source keys; ksize is doubled because the
        ; table entries are 32 bytes each
        mov     koffs_x, ksize_x
        add     ksize_x, ksize_x

        ; room for the table: the first and last keys stay in registers, so
        ; two entries fewer than NUM_AES_KEYS_MAX + 1 are reserved
        AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 2) * 32)
        push    keys2
        sub     r4, AVX_STACK_SUB
        ; sub     r4, 32
        ; sub     r4, ksize_r
        ; lea     keys2, [r4 + 32]
        ; keys2 = r4 rounded down to a multiple of 32; the table is written
        ; from keys2 + 32 upward because koffs never reaches 0 inside the loop
        mov     keys2, r4
        and     keys2, -32
    broad:
        vbroadcasti128  key_ymm, xmmword ptr [keys + 1 * koffs_r]
        vmovdqa         ymmword ptr [keys2 + koffs_r * 2], key_ymm
        sub     koffs_r, 16
        ; jnc     broad
        jnz     broad

        ; bias rN so that the loop test below can use the borrow flag
        sub     rN, ways * 2

    align 16
    avx_cbcdec_nextBlock2:
        mov     koffs_x, ksize_x
        ; AVX__WOP_KEY    AVX__CBC_START, 1 * koffs_r + 32
        AVX__WOP    AVX__CBC_START
    @@:
        AVX__WOP_KEY    AVX__VAES_DEC, 1 * koffs_r
        sub     koffs_r, 32
        jnz     @B
        ; AVX__WOP_KEY    AVX__VAES_DEC_LAST, 0
        AVX__WOP_n   AVX__VAES_DEC_LAST_key_last

        ; chaining value for lane 0: low half = iv, high half = the first
        ; ciphertext block
        AVX__vinserti128_TO_HIGH  iv_ymm, xmmword ptr [rD]
        AVX__WOP        AVX__CBC_END

        ; the last ciphertext block becomes the next iv; it is read before
        ; the plaintext is written over it
        vmovdqa         iv, xmmword ptr [rD + ways * 32 - 16]
        AVX__WOP        AVX__WRITE_TO_DATA

        add     rD, ways * 32
        sub     rN, ways * 2
        jnc     avx_cbcdec_nextBlock2
        add     rN, ways * 2

        ; undo the doubling so ksize has the value expected at
        ; AesCbc_Decode_HW_start_3
        shr     ksize_x, 1

        ; lea     r4, [r4 + 1 * ksize_r + 32]
        add     r4, AVX_STACK_SUB
        pop     keys2

        ; clear the upper register halves before continuing with SSE code
        vzeroupper
        jmp     AesCbc_Decode_HW_start_3
  else
        jmp     AesCbc_Decode_HW_start
  endif
MY_ENDP
MY_SEG_ENDP
518
519
520
521
522; ---------- AES-CBC Encode ----------
523
; scratch register for the incoming plaintext block
e0  equ  xmm1

; The first CENC_NUM_REG_KEYS round keys are kept in registers
; xmm(CENC_START_KEY) .. xmm(CENC_START_KEY + CENC_NUM_REG_KEYS - 1).
CENC_START_KEY     equ 2
CENC_NUM_REG_KEYS  equ (3 * 2)
; last_key equ @CatStr(xmm, %(CENC_START_KEY + CENC_NUM_REG_KEYS))

; AesCbc_Encode_HW (3 parameters: AES state pointer, data pointer, block count)
;   CBC encryption in place, one block at a time; each block depends on the
;   previous result, which is kept in state.
;   state is loaded from the IV slot on entry and stored back on exit.
;   Data is accessed with movdqa, so it is expected to be 16-byte aligned.
MY_SEG_PROC AesCbc_Encode_HW, 3
        MY_PROLOG (CENC_START_KEY + CENC_NUM_REG_KEYS + 0)

        movdqa  state, [keys]
        add     keys, 32

    ; preload the first round keys into registers
    i = 0
    rept CENC_NUM_REG_KEYS
        movdqa  @CatStr(xmm, %(CENC_START_KEY + i)), [keys + i * 16]
        i = i + 1
    endm

        ; keys now points at the last round key. ksize becomes the negative
        ; offset from there to the first key that is not held in a register.
        add     keys, ksize_r
        neg     ksize_r
        add     ksize_r, (16 * CENC_NUM_REG_KEYS)
        ; movdqa  last_key, [keys]
        jmp     check_e

    align 16
    nextBlock_e:
        movdqa  e0, [rD]
        mov     koffs_r, ksize_r
        ; state = previous result xor plaintext xor first round key
        pxor    e0, @CatStr(xmm, %(CENC_START_KEY))
        pxor    state, e0

    ; rounds whose keys are held in registers
    i = 1
    rept (CENC_NUM_REG_KEYS - 1)
        aesenc  state, @CatStr(xmm, %(CENC_START_KEY + i))
        i = i + 1
    endm

    ; remaining middle rounds, two per iteration, until koffs reaches 0
    @@:
        OP_KEY  aesenc, 1 * koffs_r
        OP_KEY  aesenc, 1 * koffs_r + 16
        add     koffs_r, 32
        jnz     @B
        OP_KEY  aesenclast, 0
        ; aesenclast state, last_key

        movdqa  [rD], state
        add     rD, 16
    check_e:
        sub     rN, 1
        jnc     nextBlock_e

        ; store the last result into the IV slot. The address expression
        ; undoes the adjustments made to keys and ksize above.
        ; movdqa  [keys - 32], state
        movdqa  [keys + 1 * ksize_r - (16 * CENC_NUM_REG_KEYS) - 32], state
MY_EPILOG
MY_SEG_ENDP
579
580
581
582; ---------- AES-CTR ----------
583
584ifdef x64
585        ; ways = 11
586endif
587
588
; Registers used by the CTR procedures, placed right after iv.
; one  : counter increment
; key0 : first round key. This redefines key0 and key0_ymm, which the CBC
;        decode code above bound to a different register.
one             equ  @CatStr(xmm, %(ways_start_reg + ways + 1))
one_ymm         equ  @CatStr(ymm, %(ways_start_reg + ways + 1))
key0            equ  @CatStr(xmm, %(ways_start_reg + ways + 2))
key0_ymm        equ  @CatStr(ymm, %(ways_start_reg + ways + 2))
; one more than the highest xmm register number used by CTR
NUM_CTR_REGS    equ  (ways_start_reg + ways + 3)

; INIT_CTR reg
;   Per-lane helper for WOP: advances the counter in iv by one (64-bit lane
;   add) and copies the new counter value into reg. The second parameter is
;   ignored.
INIT_CTR macro reg, _ppp_
        paddq   iv, one
        movdqa  reg, iv
endm
599
600
; AesCtr_Code_HW (3 parameters: AES state pointer, data pointer, block count)
;   CTR mode in place: each data block is xored with the encryption of a
;   counter value. The counter is incremented before every use.
;   The counter is loaded from the first 16 bytes of the state block on entry
;   and stored back there on exit.
;   Data is accessed with movdqa, so it is expected to be 16-byte aligned.
;   Two loops: a main loop handling "ways" blocks per iteration, then a loop
;   handling the remaining blocks one at a time.
;   The public labels Ctr_start, Ctr_start_2 and Ctr_start_3 are entry points
;   used by AesCtr_Code_HW_256.
MY_SEG_PROC AesCtr_Code_HW, 3
    Ctr_start::
        MY_PROLOG NUM_CTR_REGS

    ; entry for callers that have already executed the prolog
    Ctr_start_2::
        movdqa  iv, [keys]
        add     keys, 32
        movdqa  key0, [keys]

        ; keys now points at the last round key. ksize becomes the negative
        ; offset from there to the second round key.
        add     keys, ksize_r
        neg     ksize_r
        add     ksize_r, 16

    ; entry used after the 256-bit loop; keys, ksize, iv and key0 are
    ; expected to be set up already
    Ctr_start_3::
        ; one = 1 in its lowest dword
        mov     koffs_x, 1
        movd    one, koffs_x
        jmp     check2_c

    align 16
    nextBlocks2_c:
        ; give every lane the next counter value
        WOP     INIT_CTR, 0
        mov     koffs_r, ksize_r
        ; WOP_KEY pxor, 1 * koffs_r -16
        WOP     pxor, key0
    @@:
        ; middle rounds, one key per iteration, until koffs reaches 0
        WOP_KEY aesenc, 1 * koffs_r
        add     koffs_r, 16
        jnz     @B
        WOP_KEY aesenclast, 0

        WOP     XOR_WITH_DATA
        WOP     WRITE_TO_DATA
        add     rD, ways * 16
    check2_c:
        ; continue while at least "ways" blocks remain (no borrow)
        sub     rN, ways
        jnc     nextBlocks2_c
        add     rN, ways

        ; the single-block loop handles two keys per iteration, so keys and
        ; ksize are shifted by one key
        sub     keys, 16
        add     ksize_r, 16

        jmp     check_c

    ; align 16
    nextBlock_c:
        paddq   iv, one
        ; movdqa  state, [keys + 1 * koffs_r - 16]
        movdqa  state, key0
        mov     koffs_r, ksize_r
        pxor    state, iv

    @@:
        OP_KEY  aesenc, 1 * koffs_r
        OP_KEY  aesenc, 1 * koffs_r + 16
        add     koffs_r, 32
        jnz     @B
        ; the two final keys sit at offsets 0 and 16 after the shift above
        OP_KEY  aesenc, 0
        OP_KEY  aesenclast, 16

        pxor    state, [rD]
        movdqa  [rD], state
        add     rD, 16
    check_c:
        sub     rN, 1
        jnc     nextBlock_c

        ; store the updated counter into the first 16 bytes of the state
        ; block. The address expression undoes the adjustments made to keys
        ; and ksize above.
        ; movdqa  [keys - 32], iv
        movdqa  [keys + 1 * ksize_r - 16 - 32], iv
MY_EPILOG
670
671
; AesCtr_Code_HW_256 (3 parameters, same as AesCtr_Code_HW)
;   CTR mode with 256-bit registers: each lane holds two counter blocks, so
;   one iteration handles ways * 2 blocks.
;   Without use_vaes_256 the procedure is only a jump to AesCtr_Code_HW.
;   With use_vaes_256:
;     - fewer than ways * 2 blocks: continue in AesCtr_Code_HW after its
;       prolog (the prolog has already run here)
;     - otherwise: run the wide loop, then jump into AesCtr_Code_HW at
;       Ctr_start_3 to finish the remaining blocks and return
;   The round keys are broadcast to both 128-bit halves and kept in a 32-byte
;   aligned table on the stack, addressed through keys2.
MY_PROC AesCtr_Code_HW_256, 3
  ifdef use_vaes_256
        MY_PROLOG NUM_CTR_REGS

        cmp    rN, ways * 2
        jb     Ctr_start_2

        vbroadcasti128  iv_ymm, xmmword ptr [keys]
        add     keys, 32
        vbroadcasti128  key0_ymm, xmmword ptr [keys]
        ; Set up the counter pair so that after each "add one_ymm" the low
        ; half is one step behind the high half.
        ; NOTE(review): this relies on vmovd clearing the rest of one_ymm, so
        ; that the subtraction only changes the low half - confirm in the
        ; Intel SDM.
        mov     koffs_x, 1
        vmovd           one, koffs_x
        vpsubq  iv_ymm, iv_ymm, one_ymm
        ; one_ymm = 2 in both halves: every lane consumes two counter values
        vpaddq  one, one, one
        AVX__vinserti128_TO_HIGH     one_ymm, one

        ; keys now points at the last round key. koffs gets the negative
        ; offset to the second round key; ksize gets twice that, because the
        ; table entries are 32 bytes each.
        add     keys, ksize_r
        sub     ksize_x, 16
        neg     ksize_r
        mov     koffs_r, ksize_r
        add     ksize_r, ksize_r

        ; room for the table: the first key stays in a register, so one entry
        ; fewer than NUM_AES_KEYS_MAX + 1 is reserved
        AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 1) * 32)
        push    keys2
        ; keys2 = 32-byte aligned slot for the last round key, below the
        ; pushed value; the other keys are stored at negative offsets from it
        lea     keys2, [r4 - 32]
        sub     r4, AVX_STACK_SUB
        and     keys2, -32
        vbroadcasti128  key_ymm, xmmword ptr [keys]
        vmovdqa         ymmword ptr [keys2], key_ymm
     @@:
        vbroadcasti128  key_ymm, xmmword ptr [keys + 1 * koffs_r]
        vmovdqa         ymmword ptr [keys2 + koffs_r * 2], key_ymm
        add     koffs_r, 16
        jnz     @B

        ; bias rN so that the loop test below can use the borrow flag
        sub     rN, ways * 2

    align 16
    avx_ctr_nextBlock2:
        mov             koffs_r, ksize_r
        AVX__WOP        AVX__CTR_START
        ; AVX__WOP_KEY    AVX__CTR_START, 1 * koffs_r - 32
    @@:
        AVX__WOP_KEY    AVX__VAES_ENC, 1 * koffs_r
        add     koffs_r, 32
        jnz     @B
        AVX__WOP_KEY    AVX__VAES_ENC_LAST, 0

        AVX__WOP        AVX__XOR_WITH_DATA
        AVX__WOP        AVX__WRITE_TO_DATA

        add     rD, ways * 32
        sub     rN, ways * 2
        jnc     avx_ctr_nextBlock2
        add     rN, ways * 2

        ; the high half holds the most recently used counter value; hand it
        ; to the 128-bit code as iv
        vextracti128    iv, iv_ymm, 1
        ; undo the doubling so ksize has the value expected at Ctr_start_3
        sar     ksize_r, 1

        add     r4, AVX_STACK_SUB
        pop     keys2

        ; clear the upper register halves before continuing with SSE code
        vzeroupper
        jmp     Ctr_start_3
  else
        jmp     Ctr_start
  endif
MY_ENDP
MY_SEG_ENDP
741
742end
743