/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#ifdef HITLS_CRYPTO_CHACHA20

/* -------------- AVX2 overall design -----------------
 * 64-byte blocks   -> %xmm0-%xmm7,  no stack memory needed
 * 128-byte blocks  -> %xmm0-%xmm11, no stack memory needed
 * 256-byte blocks  -> %xmm0-%xmm15, uses 256 + 64 bytes of stack memory
 * 512-byte blocks  -> %ymm0-%ymm15, uses 512 + 128 bytes of stack memory
 *
 * -------------- AVX512 overall design -----------------
 * 64-byte blocks   -> %xmm0-%xmm7,  no stack memory needed
 * 128-byte blocks  -> %xmm0-%xmm11, no stack memory needed
 * 256-byte blocks  -> %xmm0-%xmm31, uses 64 bytes of stack memory
 * 512-byte blocks  -> %ymm0-%ymm31, uses 128 bytes of stack memory
 * 1024-byte blocks -> %zmm0-%zmm31, uses 256 bytes of stack memory
 */
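
/*
 * Illustrative only: a hedged C sketch of how a caller might consume input using the
 * widest path first, matching the per-call block sizes in the table above (an AVX512
 * build would add a 1024-byte tier in front). Everything here, including the function
 * names and signatures, is an assumption for exposition; the real entry points and
 * their dispatch logic are not part of this file.
 *
 *     #include <stddef.h>
 *     #include <stdint.h>
 *     // hypothetical per-size entry points (assumed signatures, for illustration only)
 *     void CHACHA20_Block512(uint32_t s[16], uint8_t *out, const uint8_t *in);
 *     void CHACHA20_Block256(uint32_t s[16], uint8_t *out, const uint8_t *in);
 *     void CHACHA20_Block128(uint32_t s[16], uint8_t *out, const uint8_t *in);
 *     void CHACHA20_Block64(uint32_t s[16], uint8_t *out, const uint8_t *in);
 *
 *     static void chacha20_update_sketch(uint32_t state[16], uint8_t *out, const uint8_t *in, size_t len)
 *     {
 *         while (len >= 512) { CHACHA20_Block512(state, out, in); in += 512; out += 512; len -= 512; }
 *         while (len >= 256) { CHACHA20_Block256(state, out, in); in += 256; out += 256; len -= 256; }
 *         while (len >= 128) { CHACHA20_Block128(state, out, in); in += 128; out += 128; len -= 128; }
 *         while (len >= 64)  { CHACHA20_Block64(state, out, in);  in += 64;  out += 64;  len -= 64;  }
 *         // a tail shorter than 64 bytes would go through a partial-block path (not shown)
 *     }
 */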

/*************************************************************************************
 * Macros using instructions common to AVX2 and AVX512
 *************************************************************************************/

/* Load the 64-byte STATE into four vector registers (%xmm0-15 paths). */
.macro LOAD_STATE s0 s1 s2 s3 adr
    vmovdqu    (\adr),   \s0           // state[0-3]
    vmovdqu    16(\adr), \s1           // state[4-7]
    vmovdqu    32(\adr), \s2           // state[8-11]
    vmovdqu    48(\adr), \s3           // state[12-15]
.endm
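
/*
 * For reference (a hedged sketch, not used by the assembly): the ChaCha20 state that
 * LOAD_STATE reads is 16 little-endian 32-bit words, laid out per RFC 8439 as four
 * 16-byte rows: constants, key (low half), key (high half), counter plus nonce. The
 * exact counter/nonce split used by the callers of these macros is not visible in this
 * excerpt, and the type name below is illustrative only.
 *
 *     #include <stdint.h>
 *     typedef struct {
 *         uint32_t w[16];   // w[0..3]  = "expa" "nd 3" "2-by" "te k" constants
 *                           // w[4..11] = 256-bit key, w[12..15] = counter and nonce
 *     } ChaCha20State;      // illustrative name; the real context type lives elsewhere
 */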

/* Load STATE into %ymm registers, broadcasting each 16-byte row to both 128-bit lanes (%ymm0-15 paths). */
.macro LOAD_512_STATE s0 s1 s2 s3 adr
    vbroadcasti128 (\adr),   \s0    // state[0-3]
    vbroadcasti128 16(\adr), \s1    // state[4-7]
    vbroadcasti128 32(\adr), \s2    // state[8-11]
    vbroadcasti128 48(\adr), \s3    // state[12-15]
.endm

/*
 * %xmm0-15, %ymm0-15 MATRIX TO STATE
 * IN:  s0 s1 s2 s3 (matrix rows), cur1 cur2 (scratch)
 * OUT: s0 s3 cur1 cur2 (hold the transposed rows A, B, C, D respectively)
 * xmm:
 * {A0 B0 C0 D0} => {A0 A1 A2 A3}
 * {A1 B1 C1 D1}    {B0 B1 B2 B3}
 * {A2 B2 C2 D2}    {C0 C1 C2 C3}
 * {A3 B3 C3 D3}    {D0 D1 D2 D3}
 * ymm:
 * {A0 B0 C0 D0 E0 F0 G0 H0} => {A0 A1 A2 A3 E0 E1 E2 E3}
 * {A1 B1 C1 D1 E1 F1 G1 H1}    {B0 B1 B2 B3 F0 F1 F2 F3}
 * {A2 B2 C2 D2 E2 F2 G2 H2}    {C0 C1 C2 C3 G0 G1 G2 G3}
 * {A3 B3 C3 D3 E3 F3 G3 H3}    {D0 D1 D2 D3 H0 H1 H2 H3}
 * zmm:
 * {A0 B0 C0 D0 E0 F0 G0 H0 I0 J0 K0 L0 M0 N0 O0 P0} => {A0 A1 A2 A3 E0 E1 E2 E3 I0 I1 I2 I3 M0 M1 M2 M3}
 * {A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1}    {B0 B1 B2 B3 F0 F1 F2 F3 J0 J1 J2 J3 N0 N1 N2 N3}
 * {A2 B2 C2 D2 E2 F2 G2 H2 I2 J2 K2 L2 M2 N2 O2 P2}    {C0 C1 C2 C3 G0 G1 G2 G3 K0 K1 K2 K3 O0 O1 O2 O3}
 * {A3 B3 C3 D3 E3 F3 G3 H3 I3 J3 K3 L3 M3 N3 O3 P3}    {D0 D1 D2 D3 H0 H1 H2 H3 L0 L1 L2 L3 P0 P1 P2 P3}
 */
.macro MATRIX_TO_STATE s0 s1 s2 s3 cur1 cur2
    vpunpckldq \s1, \s0, \cur1
    vpunpckldq \s3, \s2, \cur2
    vpunpckhdq \s1, \s0, \s1
    vpunpckhdq \s3, \s2, \s2

    vpunpcklqdq \cur2, \cur1, \s0
    vpunpckhqdq \cur2, \cur1, \s3
    vpunpcklqdq \s2, \s1, \cur1
    vpunpckhqdq \s2, \s1, \cur2
.endm
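
/*
 * Equivalent 128-bit-lane behaviour in C intrinsics (a hedged sketch for readability;
 * function and variable names are illustrative). Per 128-bit lane the macro is a 4x4
 * dword transpose: unpack 32-bit pairs, then unpack 64-bit pairs. In the macro the
 * transposed A/B/C/D rows are left in s0, s3, cur1 and cur2 respectively.
 *
 *     #include <emmintrin.h>   // SSE2
 *     // rows in: r0..r3 as in the diagram; rows out: out[0..3] = A, B, C, D rows
 *     static void matrix_to_state_sketch(__m128i r0, __m128i r1, __m128i r2, __m128i r3, __m128i out[4])
 *     {
 *         __m128i t0 = _mm_unpacklo_epi32(r0, r1);   // {A0 A1 B0 B1}
 *         __m128i t1 = _mm_unpacklo_epi32(r2, r3);   // {A2 A3 B2 B3}
 *         __m128i t2 = _mm_unpackhi_epi32(r0, r1);   // {C0 C1 D0 D1}
 *         __m128i t3 = _mm_unpackhi_epi32(r2, r3);   // {C2 C3 D2 D3}
 *         out[0] = _mm_unpacklo_epi64(t0, t1);       // {A0 A1 A2 A3}
 *         out[1] = _mm_unpackhi_epi64(t0, t1);       // {B0 B1 B2 B3}
 *         out[2] = _mm_unpacklo_epi64(t2, t3);       // {C0 C1 C2 C3}
 *         out[3] = _mm_unpackhi_epi64(t2, t3);       // {D0 D1 D2 D3}
 *     }
 */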

/*************************************************************************************
 * Macros using AVX2 instructions
 *************************************************************************************/

.macro WRITEBACK_64_AVX2 inpos outpos s0 s1 s2 s3
    vpxor  (\inpos), \s0, \s0
    vpxor  16(\inpos), \s1, \s1
    vpxor  32(\inpos), \s2, \s2
    vpxor  48(\inpos), \s3, \s3

    vmovdqu  \s0, (\outpos)          // write back output
    vmovdqu  \s1, 16(\outpos)
    vmovdqu  \s2, 32(\outpos)
    vmovdqu  \s3, 48(\outpos)

    add $64, \inpos
    add $64, \outpos
.endm
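
/*
 * What WRITEBACK_64_AVX2 computes, as a hedged C sketch (illustrative names only):
 * XOR one 64-byte keystream block, held in four 128-bit registers, into the input,
 * store the result, and advance both pointers by 64 bytes.
 *
 *     #include <stdint.h>
 *     #include <immintrin.h>
 *     static void writeback_64_sketch(const uint8_t **in, uint8_t **out, const __m128i ks[4])
 *     {
 *         for (int i = 0; i < 4; i++) {
 *             __m128i m = _mm_loadu_si128((const __m128i *)(*in + 16 * i));          // input block
 *             _mm_storeu_si128((__m128i *)(*out + 16 * i), _mm_xor_si128(m, ks[i])); // out = in ^ keystream
 *         }
 *         *in += 64;
 *         *out += 64;
 *     }
 */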

/*
 * Converts a state into a matrix (%xmm0-15 / %ymm0-15 STATE TO MATRIX).
 * s0-s15: the 16 vector registers holding the matrix rows;
 * adr:    counter-increment constant added to row 12 (the block counter);
 * base:   offset of the data storage area on the stack;
 * per:    register width in bytes (16 or 32).
 */
.macro STATE_TO_MATRIX s0 s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 s12 s13 s14 s15 base per adr
    vpshufd $0b00000000, \s3, \s12
    vpshufd $0b01010101, \s3, \s13

    vpaddd  \adr, \s12, \s12             // 0, 1, 2, 3, 4, 5, 6, 7
    vmovdqa \s12, \base+12*\per(%rsp)
    vpshufd $0b10101010, \s3, \s14
    vmovdqa \s13, \base+13*\per(%rsp)
    vpshufd $0b11111111, \s3, \s15
    vmovdqa \s14, \base+14*\per(%rsp)

    vpshufd $0b00000000, \s2, \s8
    vmovdqa \s15, \base+15*\per(%rsp)
    vpshufd $0b01010101, \s2, \s9
    vmovdqa \s8, \base+8*\per(%rsp)
    vpshufd $0b10101010, \s2, \s10
    vmovdqa \s9, \base+9*\per(%rsp)
    vpshufd $0b11111111, \s2, \s11
    vmovdqa \s10, \base+10*\per(%rsp)

    vpshufd $0b00000000, \s1, \s4
    vmovdqa \s11, \base+11*\per(%rsp)
    vpshufd $0b01010101, \s1, \s5
    vmovdqa \s4, \base+4*\per(%rsp)
    vpshufd $0b10101010, \s1, \s6
    vmovdqa \s5, \base+5*\per(%rsp)
    vpshufd $0b11111111, \s1, \s7
    vmovdqa \s6, \base+6*\per(%rsp)

    vpshufd $0b11111111, \s0, \s3
    vmovdqa \s7, \base+7*\per(%rsp)
    vpshufd $0b10101010, \s0, \s2
    vmovdqa \s3, \base+3*\per(%rsp)
    vpshufd $0b01010101, \s0, \s1
    vmovdqa \s2, \base+2*\per(%rsp)
    vpshufd $0b00000000, \s0, \s0
    vmovdqa \s1, \base+1*\per(%rsp)
    vmovdqa \s0, \base(%rsp)
.endm
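
/*
 * A hedged scalar sketch of what STATE_TO_MATRIX produces (names illustrative): each
 * of the 16 state words is broadcast across every lane of its row, and the row holding
 * the block counter (word 12) gets a per-lane increment so each lane generates the
 * keystream for a different 64-byte block. LANES is 4 for %xmm rows and 8 for %ymm rows.
 *
 *     #include <stdint.h>
 *     #define LANES 4                               // 4 (xmm) or 8 (ymm)
 *     static void state_to_matrix_sketch(const uint32_t state[16], uint32_t matrix[16][LANES])
 *     {
 *         for (int w = 0; w < 16; w++)
 *             for (int l = 0; l < LANES; l++)
 *                 matrix[w][l] = state[w];          // broadcast word w across the row
 *         for (int l = 0; l < LANES; l++)
 *             matrix[12][l] += (uint32_t)l;         // distinct block counter per lane
 *     }
 */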

/*
 * %xmm0-15 / %ymm0-15 LOAD MATRIX: reload the matrix rows from the stack and advance the counter row.
 */
.macro LOAD_MATRIX s0 s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 s12 s13 s14 s15 base per adr
    vmovdqa \base(%rsp), \s0
    vmovdqa \base+1*\per(%rsp), \s1
    vmovdqa \base+2*\per(%rsp), \s2
    vmovdqa \base+3*\per(%rsp), \s3
    vmovdqa \base+4*\per(%rsp), \s4
    vmovdqa \base+5*\per(%rsp), \s5
    vmovdqa \base+6*\per(%rsp), \s6
    vmovdqa \base+7*\per(%rsp), \s7
    vmovdqa \base+8*\per(%rsp), \s8
    vmovdqa \base+9*\per(%rsp), \s9
    vmovdqa \base+10*\per(%rsp), \s10
    vmovdqa \base+11*\per(%rsp), \s11
    vmovdqa \base+12*\per(%rsp), \s12
    vmovdqa \base+13*\per(%rsp), \s13
    vpaddd  \adr, \s12, \s12                   // add 8, 8, 8, 8, 8, 8, 8, 8 or 4, 4, 4, 4
    vmovdqa \base+14*\per(%rsp), \s14
    vmovdqa \base+15*\per(%rsp), \s15
    vmovdqa \s12, \base+12*\per(%rsp)
.endm

/*
 * ChaCha20 double-round loop body for the %xmm0-15 (256-byte) and %ymm0-15 (512-byte) paths.
 */
.macro CHACHA20_LOOP s0 s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 s12 s13 s14 s15 base per A8 ror16 ror8

    /* 0 = 0 + 4, 12 = (12 ^ 0) <<< 16 | 8 = 8 + 12, 4 = (4 ^ 8) <<< 12 |
     * 0 = 0 + 4, 12 = (12 ^ 0) <<< 8  | 8 = 8 + 12, 4 = (4 ^ 8) <<< 7
     * 1 = 1 + 5, 13 = (13 ^ 1) <<< 16 | 9 = 9 + 13, 5 = (5 ^ 9) <<< 12 |
     * 1 = 1 + 5, 13 = (13 ^ 1) <<< 8  | 9 = 9 + 13, 5 = (5 ^ 9) <<< 7
     */
    COLUM_QUARTER_AVX_0 \s0 \s4 \s12 \s1 \s5 \s13 (\ror16)
    COLUM_QUARTER_AVX_1 \s8 \s12 \s4 \s9 \s13 \s5 \s10 \s11 $20 $12
    COLUM_QUARTER_AVX_0 \s0 \s4 \s12 \s1 \s5 \s13 (\ror8)
    COLUM_QUARTER_AVX_1 \s8 \s12 \s4 \s9 \s13 \s5 \s10 \s11 $25 $7
    vmovdqa \s8, \base(\A8)
    vmovdqa \s9, \base+\per(\A8)
    vmovdqa \base+2*\per(\A8), \s10
    vmovdqa \base+3*\per(\A8), \s11

    /* 2 = 2 + 6, 14 = (14 ^ 2) <<< 16 | 10 = 10 + 14, 6 = (6 ^ 10) <<< 12 |
     * 2 = 2 + 6, 14 = (14 ^ 2) <<< 8  | 10 = 10 + 14, 6 = (6 ^ 10) <<< 7
     * 3 = 3 + 7, 15 = (15 ^ 3) <<< 16 | 11 = 11 + 15, 7 = (7 ^ 11) <<< 12 |
     * 3 = 3 + 7, 15 = (15 ^ 3) <<< 8  | 11 = 11 + 15, 7 = (7 ^ 11) <<< 7
     */
    COLUM_QUARTER_AVX_0 \s2 \s6 \s14 \s3 \s7 \s15 (\ror16)
    COLUM_QUARTER_AVX_1 \s10 \s14 \s6 \s11 \s15 \s7 \s8 \s9 $20 $12
    COLUM_QUARTER_AVX_0 \s2 \s6 \s14 \s3 \s7 \s15 (\ror8)
    COLUM_QUARTER_AVX_1 \s10 \s14 \s6 \s11 \s15 \s7 \s8 \s9 $25 $7

    /* 0 = 0 + 5, 15 = (15 ^ 0) <<< 16 | 10 = 10 + 15, 5 = (5 ^ 10) <<< 12 |
     * 0 = 0 + 5, 15 = (15 ^ 0) <<< 8  | 10 = 10 + 15, 5 = (5 ^ 10) <<< 7
     * 1 = 1 + 6, 12 = (12 ^ 1) <<< 16 | 11 = 11 + 12, 6 = (6 ^ 11) <<< 12 |
     * 1 = 1 + 6, 12 = (12 ^ 1) <<< 8  | 11 = 11 + 12, 6 = (6 ^ 11) <<< 7
     */
    COLUM_QUARTER_AVX_0 \s0 \s5 \s15 \s1 \s6 \s12 (\ror16)
    COLUM_QUARTER_AVX_1 \s10 \s15 \s5 \s11 \s12 \s6 \s8 \s9 $20 $12
    COLUM_QUARTER_AVX_0 \s0 \s5 \s15 \s1 \s6 \s12 (\ror8)
    COLUM_QUARTER_AVX_1 \s10 \s15 \s5 \s11 \s12 \s6 \s8 \s9 $25 $7
    vmovdqa \s10, \base+2*\per(\A8)
    vmovdqa \s11, \base+3*\per(\A8)
    vmovdqa \base(\A8), \s8
    vmovdqa \base+\per(\A8), \s9

    /* 2 = 2 + 7, 13 = (13 ^ 2) <<< 16 | 8 = 8 + 13, 7 = (7 ^ 8) <<< 12 |
     * 2 = 2 + 7, 13 = (13 ^ 2) <<< 8  | 8 = 8 + 13, 7 = (7 ^ 8) <<< 7
     * 3 = 3 + 4, 14 = (14 ^ 3) <<< 16 | 9 = 9 + 14, 4 = (4 ^ 9) <<< 12 |
     * 3 = 3 + 4, 14 = (14 ^ 3) <<< 8  | 9 = 9 + 14, 4 = (4 ^ 9) <<< 7
     */
    COLUM_QUARTER_AVX_0 \s2 \s7 \s13 \s3 \s4 \s14 (\ror16)
    COLUM_QUARTER_AVX_1 \s8 \s13 \s7 \s9 \s14 \s4 \s10 \s11 $20 $12
    COLUM_QUARTER_AVX_0 \s2 \s7 \s13 \s3 \s4 \s14 (\ror8)
    COLUM_QUARTER_AVX_1 \s8 \s13 \s7 \s9 \s14 \s4 \s10 \s11 $25 $7
.endm
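
/*
 * Portable reference for the round structure above (a hedged sketch per RFC 8439;
 * `qr` and `chacha20_double_round` are illustrative names, not part of this file).
 * CHACHA20_LOOP performs the same column round followed by the same diagonal round,
 * but on whole vector registers, so every lane advances one block in parallel.
 *
 *     #include <stdint.h>
 *     #define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
 *     static void qr(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
 *     {
 *         *a += *b; *d ^= *a; *d = ROTL32(*d, 16);
 *         *c += *d; *b ^= *c; *b = ROTL32(*b, 12);
 *         *a += *b; *d ^= *a; *d = ROTL32(*d, 8);
 *         *c += *d; *b ^= *c; *b = ROTL32(*b, 7);
 *     }
 *     static void chacha20_double_round(uint32_t x[16])
 *     {
 *         qr(&x[0], &x[4], &x[8],  &x[12]); qr(&x[1], &x[5], &x[9],  &x[13]);   // column round
 *         qr(&x[2], &x[6], &x[10], &x[14]); qr(&x[3], &x[7], &x[11], &x[15]);
 *         qr(&x[0], &x[5], &x[10], &x[15]); qr(&x[1], &x[6], &x[11], &x[12]);   // diagonal round
 *         qr(&x[2], &x[7], &x[8],  &x[13]); qr(&x[3], &x[4], &x[9],  &x[14]);
 *     }
 */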

/*
 * %xmm0-15 / %ymm0-15 quarter-round helper for the 16-bit and 8-bit rotations
 * (\ror is a byte-shuffle mask that rotates each 32-bit lane).
 */
.macro COLUM_QUARTER_AVX_0 a0 a1 a2 b0 b1 b2 ror
    vpaddd  \a1, \a0, \a0
    vpaddd  \b1, \b0, \b0
    vpxor   \a0, \a2, \a2
    vpxor   \b0, \b2, \b2
    vpshufb \ror, \a2, \a2
    vpshufb \ror, \b2, \b2
.endm
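
/*
 * Why vpshufb works here (a hedged sketch): rotating a 32-bit lane by 16 or 8 bits
 * only moves whole bytes, so it can be done with a single byte shuffle. The file's
 * actual ror16/ror8 constants are defined outside this excerpt; the masks below are
 * one possible encoding of the same rotations, with illustrative names.
 *
 *     #include <tmmintrin.h>   // SSSE3 _mm_shuffle_epi8
 *     static __m128i rotl16_sketch(__m128i v)
 *     {
 *         const __m128i m = _mm_setr_epi8(2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13);
 *         return _mm_shuffle_epi8(v, m);   // each dword rotated left by 16 bits
 *     }
 *     static __m128i rotl8_sketch(__m128i v)
 *     {
 *         const __m128i m = _mm_setr_epi8(3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14);
 *         return _mm_shuffle_epi8(v, m);   // each dword rotated left by 8 bits
 *     }
 */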

/*
 * %xmm0-15 / %ymm0-15 quarter-round helper for the 12-bit and 7-bit rotations
 * (the rotate is built from a right shift by \psr and a left shift by \psl, ORed together).
 */
.macro COLUM_QUARTER_AVX_1 a0 a1 a2 b0 b1 b2 cur1 cur2 psr psl
    vpaddd  \a1, \a0, \a0
    vpaddd  \b1, \b0, \b0
    vpxor   \a0, \a2, \a2
    vpxor   \b0, \b2, \b2
    vpsrld  \psr, \a2, \cur1
    vpsrld  \psr, \b2, \cur2
    vpslld  \psl, \a2, \a2
    vpslld  \psl, \b2, \b2
    vpor    \cur1, \a2, \a2
    vpor    \cur2, \b2, \b2
.endm
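
/*
 * The shift-and-OR rotate, as a hedged intrinsics sketch (illustrative name): a left
 * rotate by n bits is (x >> (32 - n)) | (x << n), which is why CHACHA20_LOOP passes
 * the shift pairs $20/$12 (rotate left by 12) and $25/$7 (rotate left by 7).
 *
 *     #include <emmintrin.h>   // SSE2
 *     static __m128i rotl_epi32_sketch(__m128i x, int n)   // 0 < n < 32
 *     {
 *         return _mm_or_si128(_mm_srli_epi32(x, 32 - n), _mm_slli_epi32(x, n));
 *     }
 */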

/*************************************************************************************
 * Macros using AVX512 instructions
 *************************************************************************************/

/* Load STATE into %zmm registers, broadcasting each 16-byte row to all four 128-bit lanes. */
.macro LOAD_1024_STATE s0 s1 s2 s3 adr
    vbroadcasti32x4 (\adr),   \s0    // state[0-3]
    vbroadcasti32x4 16(\adr), \s1    // state[4-7]
    vbroadcasti32x4 32(\adr), \s2    // state[8-11]
    vbroadcasti32x4 48(\adr), \s3    // state[12-15]
.endm

.macro WRITEBACK_64_AVX512 inpos outpos s0 s1 s2 s3
    vpxord  (\inpos), \s0, \s0
    vpxord  16(\inpos), \s1, \s1
    vpxord  32(\inpos), \s2, \s2
    vpxord  48(\inpos), \s3, \s3

    vmovdqu32  \s0, (\outpos)          // Write back output.
    vmovdqu32  \s1, 16(\outpos)
    vmovdqu32  \s2, 32(\outpos)
    vmovdqu32  \s3, 48(\outpos)

    add $64, \inpos
    add $64, \outpos
.endm

/*
 * %zmm0-15 STATE TO MATRIX: broadcast each 32-bit word of \in across a full row
 * (vpshufd acts per 128-bit lane; the lanes already hold identical copies of the state row).
 */
.macro STATE_TO_MATRIX_Z_AVX512 in out0 out1 out2 out3
    // over the four input rows, this yields rows {0,0,...,0} through {15,15,...,15}
    vpshufd $0b00000000, \in, \out0
    vpshufd $0b01010101, \in, \out1
    vpshufd $0b10101010, \in, \out2
    vpshufd $0b11111111, \in, \out3
.endm

/* AVX512 instruction set:
 * quarter-round step on four register groups at once (%zmm0-31, 1024-byte path);
 * \ror is the rotation amount applied by vprold (a left rotate).
 */
.macro COLUM_QUARTER_AVX512_4 s0 s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 ror
    vpaddd \s4, \s0, \s0
    vpaddd \s5, \s1, \s1
    vpaddd \s6, \s2, \s2
    vpaddd \s7, \s3, \s3

    vpxord \s0, \s8, \s8
    vpxord \s1, \s9, \s9
    vpxord \s2, \s10, \s10
    vpxord \s3, \s11, \s11

    vprold \ror, \s8, \s8
    vprold \ror, \s9, \s9
    vprold \ror, \s10, \s10
    vprold \ror, \s11, \s11
.endm
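
/*
 * AVX512 provides a native per-lane rotate (vprold), so the 16/12/8/7-bit rotations
 * need neither byte shuffles nor shift-and-OR pairs. A hedged intrinsics sketch of one
 * add/xor/rotate step of the macro above (illustrative name; requires AVX-512F; shown
 * with a rotation of 16, while the macro is also instantiated with 12, 8 and 7):
 *
 *     #include <immintrin.h>
 *     static __m512i qr_step16_sketch(__m512i a, __m512i b, __m512i *d)
 *     {
 *         a  = _mm512_add_epi32(a, b);        // a += b
 *         *d = _mm512_xor_si512(*d, a);       // d ^= a
 *         *d = _mm512_rol_epi32(*d, 16);      // d <<<= 16
 *         return a;
 *     }
 */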

/* AVX512 instruction set:
 * ChaCha20 double-round loop body for the %xmm0-15 (256-byte), %ymm0-15 (512-byte) and %zmm0-31 (1024-byte) paths.
 */
.macro CHACHA20_LOOP_AVX512 s00 s01 s02 s03 s04 s05 s06 s07 s08 s09 s10 s11 s12 s13 s14 s15

    /* 0 = 0 + 4, 12 = (12 ^ 0) <<< 16 | 8 = 8 + 12, 4 = (4 ^ 8) <<< 12 |
     * 0 = 0 + 4, 12 = (12 ^ 0) <<< 8  | 8 = 8 + 12, 4 = (4 ^ 8) <<< 7
     * 1 = 1 + 5, 13 = (13 ^ 1) <<< 16 | 9 = 9 + 13, 5 = (5 ^ 9) <<< 12 |
     * 1 = 1 + 5, 13 = (13 ^ 1) <<< 8  | 9 = 9 + 13, 5 = (5 ^ 9) <<< 7
     * 2 = 2 + 6, 14 = (14 ^ 2) <<< 16 | 10 = 10 + 14, 6 = (6 ^ 10) <<< 12 |
     * 2 = 2 + 6, 14 = (14 ^ 2) <<< 8  | 10 = 10 + 14, 6 = (6 ^ 10) <<< 7
     * 3 = 3 + 7, 15 = (15 ^ 3) <<< 16 | 11 = 11 + 15, 7 = (7 ^ 11) <<< 12 |
     * 3 = 3 + 7, 15 = (15 ^ 3) <<< 8  | 11 = 11 + 15, 7 = (7 ^ 11) <<< 7
     */
    COLUM_QUARTER_AVX512_4 \s00 \s01 \s02 \s03 \s04 \s05 \s06 \s07 \s12 \s13 \s14 \s15 $16
    COLUM_QUARTER_AVX512_4 \s08 \s09 \s10 \s11 \s12 \s13 \s14 \s15 \s04 \s05 \s06 \s07 $12
    COLUM_QUARTER_AVX512_4 \s00 \s01 \s02 \s03 \s04 \s05 \s06 \s07 \s12 \s13 \s14 \s15 $8
    COLUM_QUARTER_AVX512_4 \s08 \s09 \s10 \s11 \s12 \s13 \s14 \s15 \s04 \s05 \s06 \s07 $7

    /* 0 = 0 + 5, 15 = (15 ^ 0) <<< 16 | 10 = 10 + 15, 5 = (5 ^ 10) <<< 12 |
     * 0 = 0 + 5, 15 = (15 ^ 0) <<< 8  | 10 = 10 + 15, 5 = (5 ^ 10) <<< 7
     * 1 = 1 + 6, 12 = (12 ^ 1) <<< 16 | 11 = 11 + 12, 6 = (6 ^ 11) <<< 12 |
     * 1 = 1 + 6, 12 = (12 ^ 1) <<< 8  | 11 = 11 + 12, 6 = (6 ^ 11) <<< 7
     * 2 = 2 + 7, 13 = (13 ^ 2) <<< 16 | 8 = 8 + 13, 7 = (7 ^ 8) <<< 12 |
     * 2 = 2 + 7, 13 = (13 ^ 2) <<< 8  | 8 = 8 + 13, 7 = (7 ^ 8) <<< 7
     * 3 = 3 + 4, 14 = (14 ^ 3) <<< 16 | 9 = 9 + 14, 4 = (4 ^ 9) <<< 12 |
     * 3 = 3 + 4, 14 = (14 ^ 3) <<< 8  | 9 = 9 + 14, 4 = (4 ^ 9) <<< 7
     */
    COLUM_QUARTER_AVX512_4 \s00 \s01 \s02 \s03 \s05 \s06 \s07 \s04 \s15 \s12 \s13 \s14 $16
    COLUM_QUARTER_AVX512_4 \s10 \s11 \s08 \s09 \s15 \s12 \s13 \s14 \s05 \s06 \s07 \s04 $12
    COLUM_QUARTER_AVX512_4 \s00 \s01 \s02 \s03 \s05 \s06 \s07 \s04 \s15 \s12 \s13 \s14 $8
    COLUM_QUARTER_AVX512_4 \s10 \s11 \s08 \s09 \s15 \s12 \s13 \s14 \s05 \s06 \s07 \s04 $7
.endm
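
/*
 * Design note, with a hedged C sketch (illustrative names): the diagonal round above
 * is expressed purely by re-ordering the macro's register arguments, so no data has to
 * be shuffled between rounds. In scalar terms, both halves of the double round apply
 * the same quarter round, only to different index sets of the 16-word state.
 *
 *     // indices fed to the four quarter rounds of each half of the double round (RFC 8439)
 *     static const int COLUMN_QR[4][4]   = { {0, 4, 8, 12}, {1, 5, 9, 13}, {2, 6, 10, 14}, {3, 7, 11, 15} };
 *     static const int DIAGONAL_QR[4][4] = { {0, 5, 10, 15}, {1, 6, 11, 12}, {2, 7, 8, 13}, {3, 4, 9, 14} };
 */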

#endif