• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*
2 * This file is part of the openHiTLS project.
3 *
4 * openHiTLS is licensed under the Mulan PSL v2.
5 * You can use this software according to the terms and conditions of the Mulan PSL v2.
6 * You may obtain a copy of Mulan PSL v2 at:
7 *
8 *     http://license.coscl.org.cn/MulanPSL2
9 *
10 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
11 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
12 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
13 * See the Mulan PSL v2 for more details.
14 */
15
16#include "hitls_build.h"
17#ifdef HITLS_CRYPTO_GCM
18
19.file   "ghash_x86_64.S"
20.text
21
/*
 * Register aliases (System V AMD64 argument registers and xmm working set).
 * NOTE: INL/INH/INM (xmm11-xmm13) and IN_L/IN_H/IN_M (xmm3-xmm5) are two
 * distinct register triples; the 4-block loop keeps partial products in both.
 */

/* general-purpose registers: arguments of the exported functions */
.set INPUT_XI,  %rdi        // 1st argument: key (table generation) or hash state t
.set HTABLE,    %rsi        // 2nd argument: hTable
.set INPUT_IN,  %rdx        // 3rd argument: input data pointer
.set LEN,       %rcx        // 4th argument: input length in bytes

/* xmm0-xmm2: running hash value and H */
.set XI_L,      %xmm0
.set XI_H,      %xmm1
.set HKEY,      %xmm2

/* xmm3-xmm5: first partial-product triple (low, high, middle) */
.set IN_L,      %xmm3
.set IN_H,      %xmm4
.set IN_M,      %xmm5

/* xmm6-xmm10: H^2, Karatsuba key halves, scratch, byte-swap mask */
.set HKEY2,     %xmm6
.set HKEY1_2,   %xmm7
.set TEMP1,     %xmm8
.set TEMP2,     %xmm9
.set MASK,      %xmm10

/* xmm11-xmm13: second partial-product triple (low, high, middle) */
.set INL,       %xmm11
.set INH,       %xmm12
.set INM,       %xmm13

/* xmm14-xmm15: higher powers of H */
.set HKEY3,     %xmm14
.set HKEY4,     %xmm15
44
/* vpshufb control: reverse all 16 bytes of an xmm register */
.balign 16
g_bswapMask:
    .byte   0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
    .byte   0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
.size g_bswapMask, .-g_bswapMask

/* low qword = 1, high qword = 0xc2 << 56; used when twisting H in GcmTableGen4bit */
.balign 16
g_polynomial:
    .quad   0x0000000000000001, 0xc200000000000000
.size g_polynomial, .-g_polynomial

/* vpshufb control: reverse the bytes inside each 64-bit half independently */
.balign 16
g_64swapMask:
    .byte   0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
    .byte   0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
.size g_64swapMask, .-g_64swapMask

/* low qword = 0, high qword = 0xc2 << 56; reduction constant passed to REDUCTION_256BIT */
.balign 16
g_poly:
    .quad   0x0000000000000000, 0xc200000000000000
.size   g_poly, .-g_poly
62
/**
 *  Macro description: carry-less multiplication of one 128-bit block by H
 *                     (128bit * 128bit) with three PCLMULQDQ operations (Karatsuba).
 *                     The 256-bit product is NOT reduced here.
 *  Input registers: xl (block), hKey (H), hKey12 (low qword must hold H.h ^ H.l)
 *  Change registers: temp1 and temp2
 *  Result register: xh (upper 128 bits of the product), xl (lower 128 bits)
 *  All six register arguments must be distinct.
 */
.macro  GHASH_MUL128X128 xh, xl, hKey, hKey12, temp1, temp2
    /* Karatsuba operand: both qwords of temp1 become xl.h ^ xl.l */
    vpshufd      $0x4e, \xl, \temp1
    vpxor        \temp1, \xl, \temp1

    /* the three 64 x 64 carry-less products */
    vpclmulqdq   $0x11, \hKey, \xl, \xh             // xh    = xl.h * H.h
    vpclmulqdq   $0x00, \hKey, \xl, \xl             // xl    = xl.l * H.l
    vpclmulqdq   $0x00, \hKey12, \temp1, \temp1     // temp1 = (xl.h ^ xl.l) * (H.h ^ H.l)

    /* middle term = temp1 ^ high product ^ low product */
    vpxor        \xh, \xl, \temp2
    vpxor        \temp1, \temp2, \temp1

    /* add the middle term into the 256-bit result at a 64-bit offset */
    vpslldq      $8, \temp1, \temp2                 // lower half of middle term -> upper qword of xl
    vpsrldq      $8, \temp1, \temp1                 // upper half of middle term -> lower qword of xh
    vpxor        \xh, \temp1, \xh
    vpxor        \xl, \temp2, \xl
.endm
84
/**
 *  Macro description: reduces the 256-bit value xh:xl modulo g(x) in two folding phases.
 *  Input register: xh, xl
 *  Input symbol: reducMask, a 16-byte aligned constant whose high qword is the folding multiplier
 *  Change registers: temp1 and temp2
 *  Result register: xl
 */
.macro REDUCTION_256BIT xh, xl, temp1, temp2, reducMask
    vmovdqa      \reducMask(%rip), \temp1           // folding constant (callers pass g_poly)

    /* phase 1: fold the low qword of xl */
    vpshufd      $0x4e, \xl, \temp2                 // temp2 = xl with its qwords swapped
    vpclmulqdq   $0x10, \temp1, \xl, \xl            // xl = xl.l * constant.h
    vpxor        \xl, \temp2, \xl

    /* phase 2: fold once more, then add the upper 128 bits */
    vpshufd      $0x4e, \xl, \temp2
    vpclmulqdq   $0x10, \temp1, \xl, \xl
    vpxor        \temp2, \xh, \temp2
    vpxor        \xl, \temp2, \xl
.endm
102
/**
 *  Function description: x86_64 hTable pre-computation. Converts the raw hash key into the
 *                        twisted form (H << 1, conditionally xored with g_polynomial) and
 *                        stores the powers H .. H^6 together with their Karatsuba halves.
 *  Function prototype: void GcmTableGen4bit(uint8_t key[GCM_BLOCKSIZE], MODES_GCM_GF128 hTable[16]);
 *  Input register:
 *      rdi: uint8_t key[GCM_BLOCKSIZE]
 *      rsi: MODES_GCM_GF128 hTable[16]
 *  Table layout written by this function (byte offset from hTable, 16 bytes each):
 *      0x00: H (twisted)
 *      0x10: H^2
 *      0x20: low qword H.h ^ H.l,     high qword H^2.h ^ H^2.l
 *      0x30: H^3
 *      0x40: H^4
 *      0x50: low qword H^3.h ^ H^3.l, high qword H^4.h ^ H^4.l
 *      0x60: H^5
 *      0x70: H^6
 *      0x80: low qword H^5.h ^ H^5.l, high qword H^6.h ^ H^6.l
 *      Offsets 0x90 and above are not written.
 *  Change register: xmm0-xmm10, xmm14, xmm15 (every one of them is zero on return)
 *  Function/Macro Call:
 *          GHASH_MUL128X128
 *          REDUCTION_256BIT
 */
.align 32
.globl  GcmTableGen4bit
.type GcmTableGen4bit, %function
GcmTableGen4bit:
.cfi_startproc
    vmovdqu  (INPUT_XI), HKEY
    vpshufb  g_64swapMask(%rip), HKEY, HKEY // byte-reverse each 64-bit half of the key
    vpshufd  $0x4e, HKEY, IN_L              // IN_L = key halves swapped
    vpshufd  $0x55, HKEY, HKEY              // broadcast dword 1; its sign bit is the top bit of key byte 0
    vmovdqa  g_polynomial(%rip), IN_H

    vpsrlq   $63, IN_L, IN_M                // bit shifted out of each qword
    vpxor    MASK, MASK, MASK
    vpcmpgtd HKEY, MASK, HKEY               // all-ones when the broadcast dword is negative (carry out of H)
    vpand    IN_H, IN_M, IN_M               // keep only the carry out of the low qword
    vpsllq   $1, IN_L, IN_L

    vpshufd  $0x4e, IN_M, IN_M              // move that carry up to the high qword

    vpand    HKEY, IN_H, IN_H               // polynomial if there was a carry out of H, else zero
    vpor     IN_M, IN_L, IN_L               // H <<= 1 across the full 128 bits
    vpxor   IN_L, IN_H, HKEY                // twisted H

    vmovdqu  HKEY, (HTABLE)                 // store H at offset 0x00
    vpshufd  $0x4e, HKEY, HKEY1_2
    vpxor    HKEY, HKEY1_2, HKEY1_2         // both qwords = H.h ^ H.l
    vmovdqa  HKEY, XI_L
    /* xh, xl, hKey, hKey12, temp1, temp2 */
    GHASH_MUL128X128 XI_H, XI_L, HKEY, HKEY1_2, TEMP1, TEMP2                        // calculate H^2
    /* xh, xl, temp1, temp2, reducMask */
    REDUCTION_256BIT XI_H, XI_L, TEMP1, TEMP2, g_poly
    vmovdqa  XI_L, HKEY2
    GHASH_MUL128X128 XI_H, XI_L, HKEY, HKEY1_2, TEMP1, TEMP2                         // calculate H^3
    REDUCTION_256BIT XI_H, XI_L, TEMP1, TEMP2, g_poly
    vmovdqa  XI_L, HKEY3
    GHASH_MUL128X128 XI_H, XI_L, HKEY, HKEY1_2, TEMP1, TEMP2                         // calculate H^4
    REDUCTION_256BIT XI_H, XI_L, TEMP1, TEMP2, g_poly
    vmovdqa  XI_L, HKEY4
    vmovdqu  HKEY2, 0x10(HTABLE)            // store H^2 at offset 0x10
    vmovdqu  HKEY3, 0x30(HTABLE)            // store H^3 at offset 0x30
    vmovdqu  HKEY4, 0x40(HTABLE)            // store H^4 at offset 0x40

    vpshufd  $0x4e, HKEY2, TEMP1
    vpxor    HKEY2, TEMP1, TEMP1
    vshufps $0x44, TEMP1, HKEY1_2, HKEY1_2  // low qword from HKEY1_2, high qword from TEMP1
    vmovdqu  HKEY1_2, 0x20(HTABLE)          // store [H^2.h + H^2.l, H.h + H.l] at offset 0x20

    vpshufd  $0x4e, HKEY3, TEMP1
    vpshufd  $0x4e, HKEY4, TEMP2
    vpxor    HKEY3, TEMP1, TEMP1
    vpxor    HKEY4, TEMP2, TEMP2
    vshufps $0x44, TEMP2, TEMP1, HKEY1_2
    vmovdqu  HKEY1_2, 0x50(HTABLE)          // store [H^4.h + H^4.l, H^3.h + H^3.l] at offset 0x50

    vmovdqu  0x20(HTABLE), HKEY1_2          // reload [H^2.h + H^2.l, H.h + H.l]; the macro needs H.h + H.l in the low qword
    GHASH_MUL128X128 XI_H, XI_L, HKEY, HKEY1_2, TEMP1, TEMP2                         // calculate H^5,  for aes-gcm
    REDUCTION_256BIT XI_H, XI_L, TEMP1, TEMP2, g_poly
    vmovdqa  XI_L, HKEY3
    GHASH_MUL128X128 XI_H, XI_L, HKEY, HKEY1_2, TEMP1, TEMP2                         // calculate H^6,  for aes-gcm
    REDUCTION_256BIT XI_H, XI_L, TEMP1, TEMP2, g_poly
    vmovdqa  XI_L, HKEY4
    vmovdqu  HKEY3, 0x60(HTABLE)            // store H^5 at offset 0x60
    vmovdqu  HKEY4, 0x70(HTABLE)            // store H^6 at offset 0x70
    vpshufd  $0x4e, HKEY3, TEMP1
    vpshufd  $0x4e, HKEY4, TEMP2
    vpxor    HKEY3, TEMP1, TEMP1
    vpxor    HKEY4, TEMP2, TEMP2
    vshufps $0x44, TEMP2, TEMP1, HKEY1_2
    vmovdqu  HKEY1_2, 0x80(HTABLE)          // store [H^6.h + H^6.l, H^5.h + H^5.l] at offset 0x80

    /*
     * Scrub every register that still holds key-derived data. The table in memory is
     * left intact; only the register copies are cleared. MASK is already zero.
     */
    vpxor    HKEY, HKEY, HKEY
    vpxor    HKEY1_2, HKEY1_2, HKEY1_2
    vpxor    HKEY2, HKEY2, HKEY2
    vpxor    HKEY3, HKEY3, HKEY3
    vpxor    HKEY4, HKEY4, HKEY4
    vpxor    XI_L, XI_L, XI_L               // held H^6
    vpxor    XI_H, XI_H, XI_H               // held the upper half of the last product
    vpxor    TEMP1, TEMP1, TEMP1            // held H^5.h ^ H^5.l
    vpxor    TEMP2, TEMP2, TEMP2            // held H^6.h ^ H^6.l
    vpxor    IN_L, IN_L, IN_L               // held H << 1
    vpxor    IN_H, IN_H, IN_H               // held the carry-selected polynomial
    vpxor    IN_M, IN_M, IN_M               // held the inter-qword carry
    ret
.cfi_endproc
.size   GcmTableGen4bit, .-GcmTableGen4bit
192
/**
 *  Function description: x86_64 ghash assembly acceleration implementation.
 *                        For each 16-byte block of in: t = (t ^ block) * H, reduced modulo g(x).
 *                        Blocks are consumed 4 at a time, then 2, then 1, with a single
 *                        reduction per group.
 *  Function prototype: void GcmHashMultiBlock(uint8_t t[GCM_BLOCKSIZE], const MODES_GCM_GF128 hTable[16],
 *                                             const uint8_t *in, uint32_t inLen);
 *  Input register:
 *        rdi: uint8_t t[GCM_BLOCKSIZE]          (read and updated in place)
 *        rsi: const MODES_GCM_GF128 hTable[16]  (offsets 0x00-0x50 are read)
 *        rdx: const uint8_t *in
 *        ecx: uint32_t inLen, in bytes. Only the low 32 bits of rcx are used and only whole
 *             16-byte blocks are hashed: a trailing partial block is ignored, and t is left
 *             unchanged when inLen < 16.
 *  Change register: xmm0-xmm15, rcx, rdx, flags
 *  Function/Macro Call:
 *          GHASH_MUL128X128
 *          REDUCTION_256BIT      // reduction modulo g(x)
 */
.align	32
.globl GcmHashMultiBlock
.type GcmHashMultiBlock, %function
GcmHashMultiBlock:
.cfi_startproc
    vmovdqa	 g_bswapMask(%rip), MASK
    vmovdqu  (INPUT_XI), XI_L
    vmovdqu  (HTABLE), HKEY
    vmovdqu  0x20(HTABLE), HKEY1_2          // [H^2.h + H^2.l, H.h + H.l]
    vpshufb  MASK, XI_L, XI_L

    /*
     * inLen is a 32-bit argument, so bits 63:32 of rcx are unspecified on entry.
     * The 32-bit AND zero-extends into rcx and rounds the length down to whole blocks.
     * Without it, a zero or ragged length would wrap the subtractions below and read
     * past the end of the input.
     */
    andl $-16, %ecx
    jz  .Lend                               // no whole block: t is written back unchanged

    cmp $0x10, LEN
    je  .Lremain_1block

    vmovdqu  0x10(HTABLE), HKEY2
    cmp $0x40, LEN
    jae .Lmul_4blocks
    jmp .Lremain_Least_2blocks              // 2 or 3 blocks

.align	32
.Lmul_4blocks:
    /* (Xi + In_0) * H^4 + In_1 * H^3 + In_2 * H^2 + In_3 * H, reduced once */
    subq $0x40, LEN

    vmovdqu  0x30(INPUT_IN), IN_L           // load In_3, In_2
    vmovdqu  0x20(INPUT_IN), INL
    vpshufb  MASK, IN_L, IN_L
    vpshufb  MASK, INL, INL

    vmovdqa  IN_L, IN_H                     // H * In_3
    vpshufd  $0x4e, IN_L, IN_M
    vpxor    IN_L, IN_M, IN_M
    vpclmulqdq   $0x00, HKEY, IN_L, IN_L
    vpclmulqdq   $0x11, HKEY, IN_H, IN_H
    vpclmulqdq   $0x00, HKEY1_2, IN_M, IN_M // low qword of HKEY1_2 = H.h + H.l

    vmovdqa  INL, INH                       // H^2 * In_2
    vpshufd  $0x4e, INL, INM
    vpxor    INL, INM, INM
    vpclmulqdq   $0x00, HKEY2, INL, INL
    vpclmulqdq   $0x11, HKEY2, INH, INH
    vpclmulqdq   $0x10, HKEY1_2, INM, INM   // high qword of HKEY1_2 = H^2.h + H^2.l
    vxorps   INL, IN_L, IN_L                // H * In_3 + H^2 * In_2
    vxorps   INH, IN_H, IN_H
    vxorps   INM, IN_M, IN_M

    vmovdqu  0x30(HTABLE), HKEY3
    vmovdqu  0x40(HTABLE), HKEY4
    vmovdqu  0x50(HTABLE), HKEY1_2          // [H^4.h + H^4.l, H^3.h + H^3.l]

    vmovdqu  0x10(INPUT_IN), INL            // load In_1, In_0
    vmovdqu  (INPUT_IN), TEMP1
    vpshufb  MASK, INL, INL
    vpshufb  MASK, TEMP1, TEMP1

    vmovdqa  INL, INH                       // H^3 * In_1
    vpshufd  $0x4e, INL, INM
    vpxor    INL, INM, INM
    vpclmulqdq   $0x00, HKEY3, INL, INL
    vpclmulqdq   $0x11, HKEY3, INH, INH
    vpclmulqdq   $0x00, HKEY1_2, INM, INM
    vxorps   INL, IN_L, IN_L                // H * In_3 + H^2 * In_2 + H^3 * In_1
    vxorps   INH, IN_H, IN_H
    vxorps   INM, IN_M, IN_M

    vpxor    TEMP1, XI_L, XI_L              // (In_0 + Xi)
    vmovdqa  XI_L, XI_H
    vpshufd  $0x4e, XI_L, TEMP1
    vpxor    XI_L, TEMP1, TEMP1
    vpclmulqdq   $0x00, HKEY4, XI_L, XI_L   // H^4 * (In_0 + Xi)
    vpclmulqdq   $0x11, HKEY4, XI_H, XI_H
    vpclmulqdq   $0x10, HKEY1_2, TEMP1, TEMP1
    vxorps   IN_L, XI_L, XI_L               // H * In_3 + H^2 * In_2 + H^3 * In_1 + H^4 * (In_0 + Xi)
    vxorps   IN_H, XI_H, XI_H
    vxorps   IN_M, TEMP1, TEMP1

    vpxor    XI_L, TEMP1, TEMP1             // Karatsuba middle term = TEMP1 + low + high
    vpxor    XI_H, TEMP1, TEMP1
    vmovdqa  TEMP1, TEMP2
    vpslldq  $8, TEMP1, TEMP1
    vpsrldq  $8, TEMP2, TEMP2
    vpxor    TEMP1, XI_L, XI_L
    vpxor    TEMP2, XI_H, XI_H

    REDUCTION_256BIT XI_H, XI_L, TEMP1, TEMP2, g_poly
    testq    LEN, LEN
    jz     .Lend                            // finished all blocks
    leaq     0x40(INPUT_IN), INPUT_IN
    vmovdqu  0x20(HTABLE), HKEY1_2          // restore [H^2.h + H^2.l, H.h + H.l] for the next pass or the tails
    cmp     $0x40, LEN
    jae .Lmul_4blocks
    cmp     $0x20, LEN
    jae .Lremain_Least_2blocks
    jmp .Lremain_1block

.align	32
.Lremain_Least_2blocks:
    /* (Xi + In_0) * H^2 + In_1 * H, reduced once; 2 or 3 blocks remain on entry */
    subq $0x20, LEN
    vmovdqu  0x10(INPUT_IN), IN_L           // load In_1, In_0 of the remaining blocks
    vmovdqu  (INPUT_IN), TEMP1
    vpshufb  MASK, IN_L, IN_L
    vpshufb  MASK, TEMP1, TEMP1
    vpxor    TEMP1, XI_L, XI_L              // (In_0 + Xi)

    vmovdqa  IN_L, IN_H                     // H * In_1
    vpshufd  $0x4e, IN_L, IN_M
    vpxor    IN_L, IN_M, IN_M
    vpclmulqdq   $0x00, HKEY, IN_L, IN_L
    vpclmulqdq   $0x11, HKEY, IN_H, IN_H
    vpclmulqdq   $0x00, HKEY1_2, IN_M, IN_M

    vmovdqa  XI_L, XI_H                     // H^2 * (In_0 + Xi)
    vpshufd  $0x4e, XI_L, TEMP1
    vpxor    XI_L, TEMP1, TEMP1
    vpclmulqdq   $0x00, HKEY2, XI_L, XI_L
    vpclmulqdq   $0x11, HKEY2, XI_H, XI_H
    vpclmulqdq   $0x10, HKEY1_2, TEMP1, TEMP1
    vxorps   IN_L, XI_L, XI_L
    vxorps   IN_H, XI_H, XI_H
    vxorps   IN_M, TEMP1, TEMP1

    vpxor    XI_L, TEMP1, TEMP1             // Karatsuba middle term = TEMP1 + low + high
    vpxor    XI_H, TEMP1, TEMP1
    vmovdqa  TEMP1, TEMP2
    vpslldq  $8, TEMP1, TEMP1
    vpsrldq  $8, TEMP2, TEMP2
    vpxor    TEMP1, XI_L, XI_L
    vpxor    TEMP2, XI_H, XI_H

    REDUCTION_256BIT XI_H, XI_L, TEMP1, TEMP2, g_poly
    testq    LEN, LEN
    jz  .Lend
    leaq 0x20(INPUT_IN), INPUT_IN           // one block left: fall through

.align	32
.Lremain_1block:
    subq $0x10, LEN
    vmovdqu (INPUT_IN), TEMP1
    vpshufb  MASK, TEMP1, TEMP1
    vpxor TEMP1, XI_L, XI_L                 // (In_0 + Xi)

    GHASH_MUL128X128 XI_H, XI_L, HKEY, HKEY1_2, TEMP1, TEMP2
    REDUCTION_256BIT XI_H, XI_L, TEMP1, TEMP2, g_poly

.Lend:
    vpshufb  MASK, XI_L, XI_L               // back to the byte order used in memory
    vmovdqu  XI_L, (INPUT_XI)
    vpxor    HKEY, HKEY, HKEY               // clear the register copies of the key powers
    vpxor    HKEY1_2, HKEY1_2, HKEY1_2
    vpxor    HKEY2, HKEY2, HKEY2
    vpxor    HKEY3, HKEY3, HKEY3
    vpxor    HKEY4, HKEY4, HKEY4
    ret
.cfi_endproc
.size	GcmHashMultiBlock, .-GcmHashMultiBlock
361
362#endif
363