/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#ifdef HITLS_CRYPTO_CHACHA20

/* -------------- AVX2 overall design -----------------
 * 64   -> %xmm0-%xmm7    No need to use stack memory
 * 128  -> %xmm0-%xmm11   No need to use stack memory
 * 256  -> %xmm0-%xmm15   Use 256 + 64 bytes of stack memory
 * 512  -> %ymm0-%ymm15   Use 512 + 128 bytes of stack memory
 *
 * -------------- AVX512 overall design -----------------
 * 64   -> %xmm0-%xmm7    No need to use stack memory
 * 128  -> %xmm0-%xmm11   No need to use stack memory
 * 256  -> %xmm0-%xmm31   Use 64 bytes of stack memory
 * 512  -> %ymm0-%ymm31   Use 128 bytes of stack memory
 * 1024 -> %zmm0-%zmm31   Use 256 bytes of stack memory
 */

/*************************************************************************************
 * Macros shared by the AVX2 and AVX512 implementations
 *************************************************************************************/

/* %xmm0-15: load STATE macro. */
.macro LOAD_STATE s0 s1 s2 s3 adr
    vmovdqu (\adr), \s0                 // state[0-3]
    vmovdqu 16(\adr), \s1               // state[4-7]
    vmovdqu 32(\adr), \s2               // state[8-11]
    vmovdqu 48(\adr), \s3               // state[12-15]
.endm

/* %ymm0-15: load STATE macro (each 16-byte row broadcast to both 128-bit lanes). */
.macro LOAD_512_STATE s0 s1 s2 s3 adr
    vbroadcasti128 (\adr), \s0          // state[0-3]
    vbroadcasti128 16(\adr), \s1        // state[4-7]
    vbroadcasti128 32(\adr), \s2        // state[8-11]
    vbroadcasti128 48(\adr), \s3        // state[12-15]
.endm

/*
 * %xmm0-15, %ymm0-15 MATRIX TO STATE
 * IN:  s0 s1 s2 s3 cur1 cur2
 * OUT: s0 s3 cur1 cur2
 * xmm:
 * {A0 B0 C0 D0} => {A0 A1 A2 A3}
 * {A1 B1 C1 D1}    {B0 B1 B2 B3}
 * {A2 B2 C2 D2}    {C0 C1 C2 C3}
 * {A3 B3 C3 D3}    {D0 D1 D2 D3}
 * ymm:
 * {A0 B0 C0 D0 E0 F0 G0 H0} => {A0 A1 A2 A3 E0 E1 E2 E3}
 * {A1 B1 C1 D1 E1 F1 G1 H1}    {B0 B1 B2 B3 F0 F1 F2 F3}
 * {A2 B2 C2 D2 E2 F2 G2 H2}    {C0 C1 C2 C3 G0 G1 G2 G3}
 * {A3 B3 C3 D3 E3 F3 G3 H3}    {D0 D1 D2 D3 H0 H1 H2 H3}
 * zmm:
 * {A0 B0 C0 D0 E0 F0 G0 H0 I0 J0 K0 L0 M0 N0 O0 P0} => {A0 A1 A2 A3 E0 E1 E2 E3 I0 I1 I2 I3 M0 M1 M2 M3}
 * {A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1}    {B0 B1 B2 B3 F0 F1 F2 F3 J0 J1 J2 J3 N0 N1 N2 N3}
 * {A2 B2 C2 D2 E2 F2 G2 H2 I2 J2 K2 L2 M2 N2 O2 P2}    {C0 C1 C2 C3 G0 G1 G2 G3 K0 K1 K2 K3 O0 O1 O2 O3}
 * {A3 B3 C3 D3 E3 F3 G3 H3 I3 J3 K3 L3 M3 N3 O3 P3}    {D0 D1 D2 D3 H0 H1 H2 H3 L0 L1 L2 L3 P0 P1 P2 P3}
 */
.macro MATRIX_TO_STATE s0 s1 s2 s3 cur1 cur2
    vpunpckldq \s1, \s0, \cur1
    vpunpckldq \s3, \s2, \cur2
    vpunpckhdq \s1, \s0, \s1
    vpunpckhdq \s3, \s2, \s2

    vpunpcklqdq \cur2, \cur1, \s0
    vpunpckhqdq \cur2, \cur1, \s3
    vpunpcklqdq \s2, \s1, \cur1
    vpunpckhqdq \s2, \s1, \cur2
.endm
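/*
 * For reference (scalar C, not part of the build): the unpack sequence in
 * MATRIX_TO_STATE above is a 4x4 transpose of 32-bit words within each 128-bit
 * lane -- four registers that each hold the same word position of blocks A-D
 * become four registers that each hold four consecutive words of one block.
 * A minimal sketch of the same reshuffle; the helper name transpose4x4 is
 * illustrative only:
 *
 *   #include <stdint.h>
 *
 *   // Swap element [r][c] with [c][r] in a 4x4 matrix of 32-bit words.
 *   static void transpose4x4(uint32_t m[4][4])
 *   {
 *       for (int r = 0; r < 4; r++) {
 *           for (int c = r + 1; c < 4; c++) {
 *               uint32_t t = m[r][c];
 *               m[r][c] = m[c][r];
 *               m[c][r] = t;
 *           }
 *       }
 *   }
 */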
/*************************************************************************************
 * Macros specific to the AVX2 implementation
 *************************************************************************************/

.macro WRITEBACK_64_AVX2 inpos outpos s0 s1 s2 s3
    vpxor (\inpos), \s0, \s0
    vpxor 16(\inpos), \s1, \s1
    vpxor 32(\inpos), \s2, \s2
    vpxor 48(\inpos), \s3, \s3

    vmovdqu \s0, (\outpos)              // write back output
    vmovdqu \s1, 16(\outpos)
    vmovdqu \s2, 32(\outpos)
    vmovdqu \s3, 48(\outpos)

    add $64, \inpos
    add $64, \outpos
.endm

/*
 * Converts a state into a matrix.
 * %xmm0-15 / %ymm0-15 STATE TO MATRIX
 * s0-s15: the 16 vector registers; adr: counter-increment constant;
 * base: byte offset of the matrix storage area on the stack;
 * per: register width in bytes (16 or 32).
 */
.macro STATE_TO_MATRIX s0 s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 s12 s13 s14 s15 base per adr
    vpshufd $0b00000000, \s3, \s12
    vpshufd $0b01010101, \s3, \s13

    vpaddd \adr, \s12, \s12             // 0, 1, 2, 3, 4, 5, 6, 7
    vmovdqa \s12, \base+12*\per(%rsp)
    vpshufd $0b10101010, \s3, \s14
    vmovdqa \s13, \base+13*\per(%rsp)
    vpshufd $0b11111111, \s3, \s15
    vmovdqa \s14, \base+14*\per(%rsp)

    vpshufd $0b00000000, \s2, \s8
    vmovdqa \s15, \base+15*\per(%rsp)
    vpshufd $0b01010101, \s2, \s9
    vmovdqa \s8, \base+8*\per(%rsp)
    vpshufd $0b10101010, \s2, \s10
    vmovdqa \s9, \base+9*\per(%rsp)
    vpshufd $0b11111111, \s2, \s11
    vmovdqa \s10, \base+10*\per(%rsp)

    vpshufd $0b00000000, \s1, \s4
    vmovdqa \s11, \base+11*\per(%rsp)
    vpshufd $0b01010101, \s1, \s5
    vmovdqa \s4, \base+4*\per(%rsp)
    vpshufd $0b10101010, \s1, \s6
    vmovdqa \s5, \base+5*\per(%rsp)
    vpshufd $0b11111111, \s1, \s7
    vmovdqa \s6, \base+6*\per(%rsp)

    vpshufd $0b11111111, \s0, \s3
    vmovdqa \s7, \base+7*\per(%rsp)
    vpshufd $0b10101010, \s0, \s2
    vmovdqa \s3, \base+3*\per(%rsp)
    vpshufd $0b01010101, \s0, \s1
    vmovdqa \s2, \base+2*\per(%rsp)
    vpshufd $0b00000000, \s0, \s0
    vmovdqa \s1, \base+1*\per(%rsp)
    vmovdqa \s0, \base(%rsp)
.endm

/*
 * %xmm0-15 / %ymm0-15 LOAD MATRIX
 */
.macro LOAD_MATRIX s0 s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 s12 s13 s14 s15 base per adr
    vmovdqa \base(%rsp), \s0
    vmovdqa \base+1*\per(%rsp), \s1
    vmovdqa \base+2*\per(%rsp), \s2
    vmovdqa \base+3*\per(%rsp), \s3
    vmovdqa \base+4*\per(%rsp), \s4
    vmovdqa \base+5*\per(%rsp), \s5
    vmovdqa \base+6*\per(%rsp), \s6
    vmovdqa \base+7*\per(%rsp), \s7
    vmovdqa \base+8*\per(%rsp), \s8
    vmovdqa \base+9*\per(%rsp), \s9
    vmovdqa \base+10*\per(%rsp), \s10
    vmovdqa \base+11*\per(%rsp), \s11
    vmovdqa \base+12*\per(%rsp), \s12
    vmovdqa \base+13*\per(%rsp), \s13
    vpaddd \adr, \s12, \s12             // add 8, 8, 8, 8, 8, 8, 8, 8 or 4, 4, 4, 4
    vmovdqa \base+14*\per(%rsp), \s14
    vmovdqa \base+15*\per(%rsp), \s15
    vmovdqa \s12, \base+12*\per(%rsp)
.endm
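/*
 * For reference (scalar C, not part of the build): STATE_TO_MATRIX above broadcasts
 * each of the 16 state words across all dword lanes of a register and adds 0..N-1 to
 * the block counter (state word 12), so lane n of the matrix produces keystream
 * block n. A minimal sketch assuming LANES parallel blocks; the names state_to_matrix
 * and LANES are illustrative only:
 *
 *   #include <stdint.h>
 *
 *   #define LANES 8   // 8 lanes per %ymm register (4 per %xmm)
 *
 *   // matrix[i][n]: state word i as seen by lane n after the broadcast step.
 *   static void state_to_matrix(const uint32_t state[16], uint32_t matrix[16][LANES])
 *   {
 *       for (int i = 0; i < 16; i++) {
 *           for (int n = 0; n < LANES; n++) {
 *               matrix[i][n] = state[i] + ((i == 12) ? (uint32_t)n : 0u);
 *           }
 *       }
 *   }
 */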
/*
 * %xmm0-15 (256) / %ymm0-15 (512) loop
 */
.macro CHACHA20_LOOP s0 s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 s12 s13 s14 s15 base per A8 ror16 ror8

    /* 0 = 0 + 4, 12 = (12 ^ 0) <<< 16 | 8 = 8 + 12, 4 = (4 ^ 8) <<< 12 |
     * 0 = 0 + 4, 12 = (12 ^ 0) <<< 8  | 8 = 8 + 12, 4 = (4 ^ 8) <<< 7
     * 1 = 1 + 5, 13 = (13 ^ 1) <<< 16 | 9 = 9 + 13, 5 = (5 ^ 9) <<< 12 |
     * 1 = 1 + 5, 13 = (13 ^ 1) <<< 8  | 9 = 9 + 13, 5 = (5 ^ 9) <<< 7
     */
    COLUM_QUARTER_AVX_0 \s0 \s4 \s12 \s1 \s5 \s13 (\ror16)
    COLUM_QUARTER_AVX_1 \s8 \s12 \s4 \s9 \s13 \s5 \s10 \s11 $20 $12
    COLUM_QUARTER_AVX_0 \s0 \s4 \s12 \s1 \s5 \s13 (\ror8)
    COLUM_QUARTER_AVX_1 \s8 \s12 \s4 \s9 \s13 \s5 \s10 \s11 $25 $7
    vmovdqa \s8, \base(\A8)
    vmovdqa \s9, \base+\per(\A8)
    vmovdqa \base+2*\per(\A8), \s10
    vmovdqa \base+3*\per(\A8), \s11

    /* 2 = 2 + 6, 14 = (14 ^ 2) <<< 16 | 10 = 10 + 14, 6 = (6 ^ 10) <<< 12 |
     * 2 = 2 + 6, 14 = (14 ^ 2) <<< 8  | 10 = 10 + 14, 6 = (6 ^ 10) <<< 7
     * 3 = 3 + 7, 15 = (15 ^ 3) <<< 16 | 11 = 11 + 15, 7 = (7 ^ 11) <<< 12 |
     * 3 = 3 + 7, 15 = (15 ^ 3) <<< 8  | 11 = 11 + 15, 7 = (7 ^ 11) <<< 7
     */
    COLUM_QUARTER_AVX_0 \s2 \s6 \s14 \s3 \s7 \s15 (\ror16)
    COLUM_QUARTER_AVX_1 \s10 \s14 \s6 \s11 \s15 \s7 \s8 \s9 $20 $12
    COLUM_QUARTER_AVX_0 \s2 \s6 \s14 \s3 \s7 \s15 (\ror8)
    COLUM_QUARTER_AVX_1 \s10 \s14 \s6 \s11 \s15 \s7 \s8 \s9 $25 $7

    /* 0 = 0 + 5, 15 = (15 ^ 0) <<< 16 | 10 = 10 + 15, 5 = (5 ^ 10) <<< 12 |
     * 0 = 0 + 5, 15 = (15 ^ 0) <<< 8  | 10 = 10 + 15, 5 = (5 ^ 10) <<< 7
     * 1 = 1 + 6, 12 = (12 ^ 1) <<< 16 | 11 = 11 + 12, 6 = (6 ^ 11) <<< 12 |
     * 1 = 1 + 6, 12 = (12 ^ 1) <<< 8  | 11 = 11 + 12, 6 = (6 ^ 11) <<< 7
     */
    COLUM_QUARTER_AVX_0 \s0 \s5 \s15 \s1 \s6 \s12 (\ror16)
    COLUM_QUARTER_AVX_1 \s10 \s15 \s5 \s11 \s12 \s6 \s8 \s9 $20 $12
    COLUM_QUARTER_AVX_0 \s0 \s5 \s15 \s1 \s6 \s12 (\ror8)
    COLUM_QUARTER_AVX_1 \s10 \s15 \s5 \s11 \s12 \s6 \s8 \s9 $25 $7
    vmovdqa \s10, \base+2*\per(\A8)
    vmovdqa \s11, \base+3*\per(\A8)
    vmovdqa \base(\A8), \s8
    vmovdqa \base+\per(\A8), \s9

    /* 2 = 2 + 7, 13 = (13 ^ 2) <<< 16 | 8 = 8 + 13, 7 = (7 ^ 8) <<< 12 |
     * 2 = 2 + 7, 13 = (13 ^ 2) <<< 8  | 8 = 8 + 13, 7 = (7 ^ 8) <<< 7
     * 3 = 3 + 4, 14 = (14 ^ 3) <<< 16 | 9 = 9 + 14, 4 = (4 ^ 9) <<< 12 |
     * 3 = 3 + 4, 14 = (14 ^ 3) <<< 8  | 9 = 9 + 14, 4 = (4 ^ 9) <<< 7
     */
    COLUM_QUARTER_AVX_0 \s2 \s7 \s13 \s3 \s4 \s14 (\ror16)
    COLUM_QUARTER_AVX_1 \s8 \s13 \s7 \s9 \s14 \s4 \s10 \s11 $20 $12
    COLUM_QUARTER_AVX_0 \s2 \s7 \s13 \s3 \s4 \s14 (\ror8)
    COLUM_QUARTER_AVX_1 \s8 \s13 \s7 \s9 \s14 \s4 \s10 \s11 $25 $7
.endm

/*
 * %xmm0-15 / %ymm0-15 quarter-round macro (rotate left by 16 or 8 bits, via byte shuffle).
 */
.macro COLUM_QUARTER_AVX_0 a0 a1 a2 b0 b1 b2 ror
    vpaddd \a1, \a0, \a0
    vpaddd \b1, \b0, \b0
    vpxor \a0, \a2, \a2
    vpxor \b0, \b2, \b2
    vpshufb \ror, \a2, \a2
    vpshufb \ror, \b2, \b2
.endm

/*
 * %xmm0-15 / %ymm0-15 quarter-round macro (rotate left by 12 or 7 bits, via shift and or).
 */
.macro COLUM_QUARTER_AVX_1 a0 a1 a2 b0 b1 b2 cur1 cur2 psr psl
    vpaddd \a1, \a0, \a0
    vpaddd \b1, \b0, \b0
    vpxor \a0, \a2, \a2
    vpxor \b0, \b2, \b2
    vpsrld \psr, \a2, \cur1
    vpsrld \psr, \b2, \cur2
    vpslld \psl, \a2, \a2
    vpslld \psl, \b2, \b2
    vpor \cur1, \a2, \a2
    vpor \cur2, \b2, \b2
.endm
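/*
 * For reference (scalar C, not part of the build): COLUM_QUARTER_AVX_0 and
 * COLUM_QUARTER_AVX_1 above together implement the standard ChaCha20 quarter round
 * (RFC 8439) on every dword lane -- the 16- and 8-bit rotations via byte shuffle,
 * the 12- and 7-bit rotations via shift-and-or. A minimal per-lane sketch; the
 * helper names are illustrative only:
 *
 *   #include <stdint.h>
 *
 *   static inline uint32_t rotl32(uint32_t x, int n)
 *   {
 *       return (x << n) | (x >> (32 - n));
 *   }
 *
 *   // One quarter round on state words a, b, c, d.
 *   static void quarter_round(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
 *   {
 *       *a += *b; *d ^= *a; *d = rotl32(*d, 16);
 *       *c += *d; *b ^= *c; *b = rotl32(*b, 12);
 *       *a += *b; *d ^= *a; *d = rotl32(*d, 8);
 *       *c += *d; *b ^= *c; *b = rotl32(*b, 7);
 *   }
 */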
/*************************************************************************************
 * Macros specific to the AVX512 implementation
 *************************************************************************************/

/* %zmm0-15: load STATE macro. */
.macro LOAD_1024_STATE s0 s1 s2 s3 adr
    vbroadcasti32x4 (\adr), \s0         // state[0-3]
    vbroadcasti32x4 16(\adr), \s1       // state[4-7]
    vbroadcasti32x4 32(\adr), \s2       // state[8-11]
    vbroadcasti32x4 48(\adr), \s3       // state[12-15]
.endm

.macro WRITEBACK_64_AVX512 inpos outpos s0 s1 s2 s3
    vpxord (\inpos), \s0, \s0
    vpxord 16(\inpos), \s1, \s1
    vpxord 32(\inpos), \s2, \s2
    vpxord 48(\inpos), \s3, \s3

    vmovdqu32 \s0, (\outpos)            // write back output
    vmovdqu32 \s1, 16(\outpos)
    vmovdqu32 \s2, 32(\outpos)
    vmovdqu32 \s3, 48(\outpos)

    add $64, \inpos
    add $64, \outpos
.endm

/*
 * %zmm0-15 STATE TO MATRIX
 */
.macro STATE_TO_MATRIX_Z_AVX512 in out0 out1 out2 out3
    // {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} .... {15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15}
    vpshufd $0b00000000, \in, \out0
    vpshufd $0b01010101, \in, \out1
    vpshufd $0b10101010, \in, \out2
    vpshufd $0b11111111, \in, \out3
.endm

/* AVX512 instruction set
 * %zmm0-31 (1024) QUARTER
 */
.macro COLUM_QUARTER_AVX512_4 s0 s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 ror
    vpaddd \s4, \s0, \s0
    vpaddd \s5, \s1, \s1
    vpaddd \s6, \s2, \s2
    vpaddd \s7, \s3, \s3

    vpxord \s0, \s8, \s8
    vpxord \s1, \s9, \s9
    vpxord \s2, \s10, \s10
    vpxord \s3, \s11, \s11

    vprold \ror, \s8, \s8
    vprold \ror, \s9, \s9
    vprold \ror, \s10, \s10
    vprold \ror, \s11, \s11
.endm

/* AVX512 instruction set
 * %xmm0-15 (256) / %ymm0-15 (512) / %zmm0-31 (1024) loop
 */
.macro CHACHA20_LOOP_AVX512 s00 s01 s02 s03 s04 s05 s06 s07 s08 s09 s10 s11 s12 s13 s14 s15

    /* 0 = 0 + 4, 12 = (12 ^ 0) <<< 16 | 8 = 8 + 12, 4 = (4 ^ 8) <<< 12 |
     * 0 = 0 + 4, 12 = (12 ^ 0) <<< 8  | 8 = 8 + 12, 4 = (4 ^ 8) <<< 7
     * 1 = 1 + 5, 13 = (13 ^ 1) <<< 16 | 9 = 9 + 13, 5 = (5 ^ 9) <<< 12 |
     * 1 = 1 + 5, 13 = (13 ^ 1) <<< 8  | 9 = 9 + 13, 5 = (5 ^ 9) <<< 7
     * 2 = 2 + 6, 14 = (14 ^ 2) <<< 16 | 10 = 10 + 14, 6 = (6 ^ 10) <<< 12 |
     * 2 = 2 + 6, 14 = (14 ^ 2) <<< 8  | 10 = 10 + 14, 6 = (6 ^ 10) <<< 7
     * 3 = 3 + 7, 15 = (15 ^ 3) <<< 16 | 11 = 11 + 15, 7 = (7 ^ 11) <<< 12 |
     * 3 = 3 + 7, 15 = (15 ^ 3) <<< 8  | 11 = 11 + 15, 7 = (7 ^ 11) <<< 7
     */
    COLUM_QUARTER_AVX512_4 \s00 \s01 \s02 \s03 \s04 \s05 \s06 \s07 \s12 \s13 \s14 \s15 $16
    COLUM_QUARTER_AVX512_4 \s08 \s09 \s10 \s11 \s12 \s13 \s14 \s15 \s04 \s05 \s06 \s07 $12
    COLUM_QUARTER_AVX512_4 \s00 \s01 \s02 \s03 \s04 \s05 \s06 \s07 \s12 \s13 \s14 \s15 $8
    COLUM_QUARTER_AVX512_4 \s08 \s09 \s10 \s11 \s12 \s13 \s14 \s15 \s04 \s05 \s06 \s07 $7

    /* 0 = 0 + 5, 15 = (15 ^ 0) <<< 16 | 10 = 10 + 15, 5 = (5 ^ 10) <<< 12 |
     * 0 = 0 + 5, 15 = (15 ^ 0) <<< 8  | 10 = 10 + 15, 5 = (5 ^ 10) <<< 7
     * 1 = 1 + 6, 12 = (12 ^ 1) <<< 16 | 11 = 11 + 12, 6 = (6 ^ 11) <<< 12 |
     * 1 = 1 + 6, 12 = (12 ^ 1) <<< 8  | 11 = 11 + 12, 6 = (6 ^ 11) <<< 7
     * 2 = 2 + 7, 13 = (13 ^ 2) <<< 16 | 8 = 8 + 13, 7 = (7 ^ 8) <<< 12 |
     * 2 = 2 + 7, 13 = (13 ^ 2) <<< 8  | 8 = 8 + 13, 7 = (7 ^ 8) <<< 7
     * 3 = 3 + 4, 14 = (14 ^ 3) <<< 16 | 9 = 9 + 14, 4 = (4 ^ 9) <<< 12 |
     * 3 = 3 + 4, 14 = (14 ^ 3) <<< 8  | 9 = 9 + 14, 4 = (4 ^ 9) <<< 7
     */
    COLUM_QUARTER_AVX512_4 \s00 \s01 \s02 \s03 \s05 \s06 \s07 \s04 \s15 \s12 \s13 \s14 $16
    COLUM_QUARTER_AVX512_4 \s10 \s11 \s08 \s09 \s15 \s12 \s13 \s14 \s05 \s06 \s07 \s04 $12
    COLUM_QUARTER_AVX512_4 \s00 \s01 \s02 \s03 \s05 \s06 \s07 \s04 \s15 \s12 \s13 \s14 $8
    COLUM_QUARTER_AVX512_4 \s10 \s11 \s08 \s09 \s15 \s12 \s13 \s14 \s05 \s06 \s07 \s04 $7
.endm

#endif
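/*
 * For reference (scalar C, not part of the build): one pass of CHACHA20_LOOP /
 * CHACHA20_LOOP_AVX512 above is a ChaCha20 double round -- a column round on word
 * groups (0,4,8,12)..(3,7,11,15) followed by a diagonal round on (0,5,10,15)..(3,4,9,14),
 * as listed in the macro comments. A minimal sketch of that index pattern; the helper
 * names are illustrative only:
 *
 *   #include <stdint.h>
 *
 *   static inline uint32_t rotl32(uint32_t x, int n)
 *   {
 *       return (x << n) | (x >> (32 - n));
 *   }
 *
 *   static void qr(uint32_t s[16], int a, int b, int c, int d)
 *   {
 *       s[a] += s[b]; s[d] ^= s[a]; s[d] = rotl32(s[d], 16);
 *       s[c] += s[d]; s[b] ^= s[c]; s[b] = rotl32(s[b], 12);
 *       s[a] += s[b]; s[d] ^= s[a]; s[d] = rotl32(s[d], 8);
 *       s[c] += s[d]; s[b] ^= s[c]; s[b] = rotl32(s[b], 7);
 *   }
 *
 *   // One double round; ChaCha20 performs ten of these (20 rounds in total).
 *   static void double_round(uint32_t s[16])
 *   {
 *       qr(s, 0, 4,  8, 12); qr(s, 1, 5,  9, 13);   // column round
 *       qr(s, 2, 6, 10, 14); qr(s, 3, 7, 11, 15);
 *       qr(s, 0, 5, 10, 15); qr(s, 1, 6, 11, 12);   // diagonal round
 *       qr(s, 2, 7,  8, 13); qr(s, 3, 4,  9, 14);
 *   }
 */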