/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */
#include "hitls_build.h"
#if defined(HITLS_CRYPTO_CHACHA20) && defined(HITLS_CRYPTO_CHACHA20POLY1305)

.file   "poly1305_x86_64_macro.s"
.text

22.align   32
23g_129:
24    .long    1<<24, 0, 1<<24, 0, 1<<24, 0, 1<<24, 0
25.size    g_129, .-g_129
26.align   32
27g_mask26:
28    .long    0x3ffffff, 0, 0x3ffffff, 0, 0x3ffffff, 0, 0x3ffffff, 0
29.size    g_mask26, .-g_mask26
30.align   32
31g_permd_avx2:
32    .long    2, 2, 2, 3, 2, 0, 2, 1
33.size   g_permd_avx2, .-g_permd_avx2
34
35.set    CTX, %rdi
36.set    INP, %rsi
37.set    LEN, %rdx
38.set    PADBIT, %rcx
39
40.set    ACC1, %r14
41.set    ACC2, %rbx
42.set    ACC3, %rbp
43.set    D1, %r8
44.set    D2, %r9
45.set    D3, %r10
46.set    R0, %r11
47.set    R1, %r12
48.set    R2, %r13
49
50.set    YH0, %ymm0
51.set    YH1, %ymm1
52.set    YH2, %ymm2
53.set    YH3, %ymm3
54.set    YH4, %ymm4
55.set    YT0, %ymm5
56.set    YT1, %ymm6
57.set    YT2, %ymm7
58.set    YT3, %ymm8
59.set    YT4, %ymm9
60.set    YMASK, %ymm10
61.set    YB0, %ymm11
62.set    YB1, %ymm12
63.set    YB2, %ymm13
64.set    YB3, %ymm14
65.set    YB4, %ymm15
66
67/**
68 *  Macro description: x86_64 poly1305 big number multiplication modulo basic instruction implementation (acc1|acc2|acc3) = (acc1|acc2|acc3) * (r0|r1) mod P
69 *  Input register:
70 *        acc1-3: accumulator
71 *        r0-1: key r
72 *        r2: r1 + (r1 >> 2)
73 *  Change register: r8-r14, rbx, rbp, rax
74 *  Output register:
75 *        acc1-3: result of the one block operation
76 */
77.macro POLY1305_MOD_MUL acc1 acc2 acc3 r0 r1 r2
78    mulq \acc1                           // acc1 * r1
79    movq %rax, D2
80    movq \r0, %rax
81    movq %rdx, D3
82
83    mulq \acc1                           // acc1 * r0
84    movq %rax, \acc1
85    movq \r0, %rax
86    movq %rdx, D1
87
88    mulq \acc2                           // acc2 * r0
89    addq %rax, D2
90    movq \r2, %rax
91    adcq %rdx, D3
92
93    mulq \acc2                           // acc2 * (r1 + (r1 >> 2))
94    movq \acc3, \acc2
95    addq %rax, \acc1
96    adcq %rdx, D1
97
98    imulq   \r2, \acc2                   // acc3 * (r1 + (r1 >> 2))
99    addq \acc2, D2
100    movq D1, \acc2
101    adcq $0, D3
102
103    imulq   \r0, \acc3                   // acc3 * r0
104    mov $-4, %rax
105    addq D2, \acc2
106    adcq \acc3, D3
107
108    andq D3, %rax                        // reduction
109    movq D3, \acc3
110    shrq $2, D3
111    andq $3, \acc3
112    addq D3, %rax
113    addq %rax, \acc1
114    adcq $0, \acc2
115    adcq $0, \acc3
116.endm
117
118/**
119 *  Macro description: converts 130-bit base2^26 data into base 2^64 data.
120 *  Input register:
121 *        a1: large data block 0 in the original format
122 *        d1: large data block 1 in the original format
123 *        a2: large data block 2 in the original format
124 *        d2: large data block 3 in the original format
125 *        r2: big number of data blocks 2 and 3 in the original format
126 *        a3: large data block 4 in the original format
127 *  Modify the register r8, r9, r13, r14, rbx, rbp.
128 *  Output register:
129 *       a1: bits 0 to 63 of the converted big number
130 *       a2: 64-127 bits of the converted big number
131 *       a3: 128-130 bits of the converted big number
132 * Function/Macro Call: None
133 */
134.macro CONVERT_26TO64    a1 d1 a2 d2 r2 a3
135    shrq $6, \d1
136    shlq $52, \r2
137    shrq $12, \a2
138    addq \d1, \a1
139    shrq $18, \d2
140    addq \r2, \a1                              // 1st 64bit
141
142    adcq \d2, \a2
143    movq \a3, \d1
144    shlq $40, \d1
145    shrq $24, \a3
146    addq \d1, \a2                              // 2nd 64bit
147    adcq $0, \a3                               // 3rd 64bit
148.endm
149
150/**
151 *  Macro description: converts 130-bit base2^64 data to base 2^26 data.
152 *  Input register:
153 *        a1: large data block 0 in the original format
154 *        a2: large data block 1 in the original format
155 *        a3: large data block 2 in the original format
156 *  Modify the register: r8, r9, r14, rax, rdx, rbp, rbx.
157 *  Output register:
158 *       a4: 0 to 25 digits of the converted big number
159 *       a5: 26 to 51 digits of the converted big number
160 *       a1: 52 to 77 digits of the converted big number
161 *       a2: 78 to 103 bits of the converted big number
162 *       a3: 104-130 bits of the converted big number
163 *  Function/Macro Call: None
164 */
165.macro CONVERT_64TO26    a1 a2 a3 a4 a5
166    movq \a1, \a4
167    movq \a1, \a5
168    andq $0x3ffffff, \a4                        // 1st 26bit
169    shrq $26, \a5
170    movd \a4, %xmm0
171    andq $0x3ffffff, \a5                        // 2nd 26bit
172    shrq $52, \a1
173    movd \a5, %xmm1
174    movq \a2, D1
175    movq \a2, D2
176    shlq $12, D1
177    orq  D1, \a1
178    andq $0x3ffffff, \a1                        // 3rd 26bit
179    shrq $14, \a2
180    movd \a1, %xmm2
181    shlq $24, \a3
182    andq $0x3ffffff, \a2                        // 4th 26bit
183    shrq $40, D2
184    movd \a2, %xmm3
185    orq  D2, \a3                                // 5th 26bit
186    movl $1, 220(CTX)
187    movd \a3, %xmm4
188
189.endm
190
191/**
192 *  Macro description: preprocessing of converting base2^26 data to base 2^64
193 *  Input register: 128 bits of acc1 and acc2 data
194 *  Change register: r8-r10, r14, and rbx.
195 *  Output register: acc1, acc2, d1, d2, d3
196 */
197.macro CONVERT_26TO64_PRE   acc1 acc2 d1 d2 d3
198    movq $0xffffffff, \d3                       // base2_26 --> base2_64
199    movq \acc1, \d1
200    movq \acc2, \d2
201    andq \d3, \acc1
202    andq \d3, \acc2
203    andq $-1*(1<<31), \d1
204    movq \d2, \d3
205    andq $-1*(1<<31), \d2
206.endm
207
208/**
209 *  Macro description: load accumulator data and key r
210 *  Input register: in_ctx context
211 *  Modify the register: r8, r11-r14, rax, rbp, rbx.
212 *  Output register:
213 *      r0 - r2: key r
214 *      acc1 - acc3: accumulator data
215 *      flag: indicates the data organization flag of the current accumulator.
216 *      mul: r1
217 */
218.macro LOAD_ACC_R   inctx r0 r1 r2 acc1 acc2 acc3 flag mul
219    movq 24(\inctx), \r0                        // load r
220    movq 32(\inctx), \r1
221    movl 220(\inctx), \flag                     // judge the ACC organization form.
222    movq \r1, \r2
223    movq (\inctx), \acc1                        // load acc
224    shrq $2, \r2
225    movq 8(\inctx), \acc2
226    addq \r1, \r2                               // R2 = R1 + (R1 >> 2)
227    movq 16(\inctx), \acc3
228    movq \r1, \mul
229.endm
230
231/**
232 *  Macro description: The avx2 instruction set implements parallel operation of the last four blocks.
233 *  Input register:
234 *      yh0 - yh4: stores messages.
235 *      yt0 - yt4: stores keys.
236 *      yb0 - yb4: temporary storage of intermediate results
237 *      addr: stack address
238 *  Output register:
239 *      yh0 - yh4: store operation results.
240 */
241.macro BLOCK4_AVX2_TAIL   yt0 yt1 yt2 yt3 yt4 yh0 yh1 yh2 yh3 yh4 yb0 yb1 yb2 yb3 yb4 ymask addr
242    vpaddq      \yt0, \yh0, \yh0
243    vpaddq      \yt1, \yh1, \yh1
244    vpaddq      \yt3, \yh3, \yh3
245    vpaddq      \yt4, \yh4, \yh4
246    vmovdqu     0x4(\addr), \yt0                          // r0^i
247    vmovdqu     0x24(\addr), \yt1                         // r1^i
248    vmovdqu     0x64(\addr), \yt2                         // r2^i
249    vmovdqu     0xc4(\addr), \yt3                         // s3^i
250    vmovdqu     0x104(\addr), \ymask                      // s4^i
251
252    vpmuludq    \yh2, \yt0, \yb2                          // b2 = h2 * r0^i
253    vpmuludq    \yh2, \yt1, \yb3                          // b3 = h2 * r1^i
254    vpmuludq    \yh2, \yt2, \yb4                          // b4 = h2 * r2^i
255    vpmuludq    \yh2, \yt3, \yb0                          // b0 = h2 * s3^i
256    vpmuludq    \yh2, \ymask, \yb1                        // b1 = h2 * s4^i
257
258    vpmuludq    \yh1, \yt1, \yt4                          // h1 * r1^i
259    vpmuludq    \yh0, \yt1, \yh2                          // h0 * r1^i
260    vpaddq      \yt4, \yb2, \yb2                          // b2 += h1 * r1^i
261    vpaddq      \yh2, \yb1, \yb1                          // b1 += h0 * r1^i
262    vpmuludq    \yh3, \yt1, \yt4                          // h3 * r1^i
263    vpmuludq    0x44(\addr), \yh4, \yh2                   // h4 * s1^i
264    vpaddq      \yt4, \yb4, \yb4                          // b4 += h3 * r1^i
265    vpaddq      \yh2, \yb0, \yb0                          // b0 += h4 * s1^i
266    vmovdqu     0x84(\addr), \yt1                         // load s2^i
267
268    vpmuludq    \yh4, \yt0, \yt4                          // h4 * r0^i
269    vpmuludq    \yh3, \yt0, \yh2                          // h3 * r0^i
270    vpaddq      \yt4, \yb4, \yb4                          // b4 += h4 * r0^i
271    vpaddq      \yh2, \yb3, \yb3                          // b3 += h3 * r0^i
272    vpmuludq    \yh0, \yt0, \yt4                          // h0 * r0^i
273    vpmuludq    \yh1, \yt0, \yh2                          // h1 * r0^i
274    vpaddq      \yt4, \yb0, \yb0                          // b0 += h0 * r0^i
275    vpaddq      \yh2, \yb1, \yb1                          // b1 += h1 * r0^i
276
277    vpmuludq    \yh1, \yt2, \yt4                          // h1 * r2^i
278    vpmuludq    \yh0, \yt2, \yh2                          // h0 * r2^i
279    vpaddq      \yt4, \yb3, \yb3                          // b3 += h1 * r2^i
280    vpaddq      \yh2, \yb2, \yb2                          // b2 += h0 * r2^i
281    vpmuludq    \yh4, \yt1, \yt4                          // h4 * s2^i
282    vpmuludq    \yh3, \yt1, \yh2                          // h3 * s2^i
283    vpaddq      \yt4, \yb1, \yb1                          // b1 += h4 * s2^i
284    vpaddq      \yh2, \yb0, \yb0                          // b0 += h3 * s2^i
285    vmovdqu     0xa4(\addr), \yh2                         // load r3^i
286
287    vpmuludq    \yh1, \yh2, \yt4                          // h1 * r3^i
288    vpmuludq    \yh0, \yh2, \yh2                          // h0 * r3^i
289    vpaddq      \yt4, \yb4, \yb4                          // b4 += h1 * r3^i
290    vpaddq      \yh2, \yb3, \yb3                          // b3 += h0 * r3^i
291    vpmuludq    \yh4, \yt3, \yt4                          // h4 * s3^i
292    vpmuludq    \yh3, \yt3, \yh2                          // h3 * s3^i
293    vpaddq      \yt4, \yb2, \yb2                          // b2 += h4 * s3^i
294    vpaddq      \yh2, \yb1, \yb1                          // b1 += h3 * s3^i   (finish)
295
296    vpmuludq    \yh3, \ymask, \yh3                        // h3 * s4^i
297    vpmuludq    \yh4, \ymask, \yh4                        // h4 * s4^i
298    vpaddq  \yb2, \yh3, \yh2                              // h2 += h3 * s4^i   (finish)
299    vpaddq  \yb3, \yh4, \yh3                              // h3 += h4 * s4^i   (finish)
300    vpmuludq    0xe4(\addr), \yh0, \yh4                   // h0 * r4^i
301    vpmuludq    \yh1, \ymask, \yh0                        // h1 * s4^i
302    vmovdqu     g_mask26(%rip), \ymask
303    vpaddq  \yh4, \yb4, \yh4                              // h4 += h0 * r4^i   (finish)
304    vpaddq  \yh0, \yb0, \yh0                              // h0 += h1 * s4^i   (finish)
305
306    // Summary of calculation results of different blocks
307    vpsrldq     $8, \yh0, \yt0
308    vpsrldq     $8, \yb1, \yt1
309    vpaddq      \yt0, \yh0, \yh0
310    vpsrldq     $8, \yh2, \yt2
311    vpaddq      \yt1, \yb1, \yb1
312    vpsrldq     $8, \yh3, \yt3
313    vpaddq      \yt2, \yh2, \yh2
314    vpsrldq     $8, \yh4, \yt4
315    vpaddq      \yt3, \yh3, \yh3
316    vpaddq      \yt4, \yh4, \yh4
317
318    vpermq      $0x2, \yh0, \yt0
319    vpermq      $0x2, \yb1, \yt1
320    vpaddq      \yt0, \yh0, \yh0
321    vpermq      $0x2, \yh2, \yt2
322    vpaddq      \yt1, \yb1, \yb1
323    vpermq      $0x2, \yh3, \yt3
324    vpaddq      \yt2, \yh2, \yh2
325    vpermq      $0x2, \yh4, \yt4
326    vpaddq      \yt3, \yh3, \yh3
327    vpaddq      \yt4, \yh4, \yh4
328
329    // reduction
330    vpsrlq      $26, \yh3, \yb3
331    vpand       \ymask, \yh3, \yh3
332    vpaddq      \yb3, \yh4, \yh4                          // h3 -> h4
333    vpsrlq      $26, \yh0, \yb0
334    vpand       \ymask, \yh0, \yh0
335    vpaddq      \yb0, \yb1, \yh1                          // h0 -> h1
336    vpsrlq      $26, \yh4, \yb4
337    vpand       \ymask, \yh4, \yh4
338    vpsrlq      $26, \yh1, \yb1
339    vpand       \ymask, \yh1, \yh1
340    vpaddq      \yb1, \yh2, \yh2                          // h1 -> h2
341    vpaddq      \yb4, \yh0, \yh0
342    vpsllq      $2, \yb4, \yb4
343    vpaddq      \yb4, \yh0, \yh0                          // h4 -> h0
344    vpsrlq      $26, \yh2, \yb2
345    vpand       \ymask, \yh2, \yh2
346    vpaddq      \yb2, \yh3, \yh3                          // h2 -> h3
347    vpsrlq      $26, \yh0, \yb0
348    vpand       \ymask, \yh0, \yh0
349    vpaddq      \yb0, \yh1, \yh1                          // h0 -> h1
350    vpsrlq      $26, \yh3, \yb3
351    vpand       \ymask, \yh3, \yh3
352    vpaddq      \yb3, \yh4, \yh4                          // h3 -> h4
353.endm
354
#endif