/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#ifdef HITLS_CRYPTO_CHACHA20

/*
 * Syntax : GNU as, AT&T operand order (op src, dst), run through the C preprocessor.
 * ABI    : System V AMD64.
 * ISA    : SSE2 plus pshufb (SSSE3) and pinsrd (SSE4.1).
 */

.text

/*
 * Byte-shuffle masks for pshufb. They are used as memory operands of the
 * legacy-SSE encoding, which faults on a misaligned 128-bit operand, so the
 * 16-byte alignment is requested explicitly instead of relying on layout.
 */
.balign 16
.LAndBlock:                             // not referenced by the code in this file
    .long   1, 0, 0, 0
.LRor16:                                // rotate every 32-bit lane by 16 bits
    .byte   0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.LRor8:                                 // rotate every 32-bit lane left by 8 bits
    .byte   0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe

/* Byte offset of state word 12 (the 32-bit block counter) inside the context. */
.set CTR_OFFSET, 48
/* Each loop iteration performs one column round and one diagonal round. */
.set DOUBLE_ROUNDS, 10

.set IN, %r9                            // running input pointer
.set OUT, %r10                          // running output pointer

/* Original (input) state, kept for the final feed-forward addition */
.set O00, %xmm12                        // words 0-3
.set O01, %xmm13                        // words 4-7
.set O02, %xmm14                        // words 8-11
.set O03, %xmm15                        // words 12-15, lane 0 = block counter

/* Working state 0 */
.set S00, %xmm0                         // row 0 of state 0
.set S01, %xmm1                         // row 1 of state 0
.set S02, %xmm2                         // row 2 of state 0
.set S03, %xmm3                         // row 3 of state 0

/* Working state 1 */
.set S10, %xmm5                         // row 0 of state 1
.set S11, %xmm6                         // row 1 of state 1
.set S12, %xmm7                         // row 2 of state 1
.set S13, %xmm8                         // row 3 of state 1

/*
 * Four quarter rounds in parallel, one per 32-bit lane:
 *   a += b; d ^= a; d <<<= 16;
 *   c += d; b ^= c; b <<<= 12;
 *   a += b; d ^= a; d <<<= 8;
 *   c += d; b ^= c; b <<<= 7;
 * A, B, C, D are the four row registers; TMP is a scratch xmm register that
 * is overwritten. EFLAGS are not modified.
 */
.macro CHACHA20_ROUND A B C D TMP
    paddd   \B, \A
    pxor    \A, \D
    pshufb  .LRor16(%rip), \D           // d <<<= 16

    paddd   \D, \C
    pxor    \C, \B
    movdqa  \B, \TMP
    psrld   $20, \B
    pslld   $12, \TMP
    por     \TMP, \B                    // b <<<= 12

    paddd   \B, \A
    pxor    \A, \D
    pshufb  .LRor8(%rip), \D            // d <<<= 8

    paddd   \D, \C
    pxor    \C, \B
    movdqa  \B, \TMP
    psrld   $25, \B
    pslld   $7, \TMP
    por     \TMP, \B                    // b <<<= 7
.endm

/*
 * The same round applied to two independent states, one after the other:
 * first (S0..S3, scratch CUR), then (S4..S7, scratch CUR1).
 */
.macro CHACHA20_2_ROUND S0 S1 S2 S3 CUR S4 S5 S6 S7 CUR1
    CHACHA20_ROUND \S0 \S1 \S2 \S3 \CUR
    CHACHA20_ROUND \S4 \S5 \S6 \S7 \CUR1
.endm

/*
 * Rotate rows 1-3 so that the diagonals line up in columns:
 *   C {8 9 10 11}   ==> {10 11 8 9}
 *   B {4 5 6 7}     ==> {5 6 7 4}
 *   D {12 13 14 15} ==> {15 12 13 14}
 */
.macro DIAGONALIZE B C D
    pshufd  $78, \C, \C                 // selector 01 00 11 10
    pshufd  $57, \B, \B                 // selector 00 11 10 01
    pshufd  $147, \D, \D                // selector 10 01 00 11
.endm

/*
 * Inverse of DIAGONALIZE:
 *   C {10 11 8 9}   ==> {8 9 10 11}
 *   B {5 6 7 4}     ==> {4 5 6 7}
 *   D {15 12 13 14} ==> {12 13 14 15}
 */
.macro UNDIAGONALIZE B C D
    pshufd  $78, \C, \C                 // selector 01 00 11 10
    pshufd  $147, \B, \B                // selector 10 01 00 11
    pshufd  $57, \D, \D                 // selector 00 11 10 01
.endm

/*
 * Feed-forward and XOR for one 64-byte block:
 *   out[0..63] = in[0..63] ^ (state 0 + original state)
 * Overwrites S00-S03 and xmm4, xmm9, xmm10, xmm11.
 */
.macro WRITE_BACK_64 IN_POS OUT_POS
    paddd   O00, S00
    paddd   O01, S01
    paddd   O02, S02
    paddd   O03, S03

    movdqu  (\IN_POS), %xmm4            // load the input block
    movdqu  16(\IN_POS), %xmm9
    movdqu  32(\IN_POS), %xmm10
    movdqu  48(\IN_POS), %xmm11

    pxor    %xmm4, S00
    pxor    %xmm9, S01
    pxor    %xmm10, S02
    pxor    %xmm11, S03

    movdqu  S00, (\OUT_POS)             // store the output block
    movdqu  S01, 16(\OUT_POS)
    movdqu  S02, 32(\OUT_POS)
    movdqu  S03, 48(\OUT_POS)
.endm

/*
 * Feed-forward and XOR for two consecutive 64-byte blocks.
 * On entry O03 holds the counter of block 0 and r11d the counter of block 1;
 * on exit O03 lane 0 holds the counter of block 1.
 * Overwrites S00-S03, S10-S13 and xmm4, xmm9, xmm10, xmm11.
 */
.macro WRITE_BACK_128 IN_POS OUT_POS
    paddd   O00, S00                    // state 0 += original state 0
    paddd   O01, S01
    paddd   O02, S02
    paddd   O03, S03

    pinsrd  $0, %r11d, O03              // original state 0 -> original state 1

    paddd   O00, S10                    // state 1 += original state 1
    paddd   O01, S11
    paddd   O02, S12
    paddd   O03, S13

    movdqu  (\IN_POS), %xmm4            // load input block 0
    movdqu  16(\IN_POS), %xmm9
    movdqu  32(\IN_POS), %xmm10
    movdqu  48(\IN_POS), %xmm11

    pxor    %xmm4, S00                  // input 0 ^ key stream 0
    pxor    %xmm9, S01
    pxor    %xmm10, S02
    pxor    %xmm11, S03

    movdqu  S00, (\OUT_POS)             // store output block 0
    movdqu  S01, 16(\OUT_POS)
    movdqu  S02, 32(\OUT_POS)
    movdqu  S03, 48(\OUT_POS)

    movdqu  64(\IN_POS), %xmm4          // load input block 1
    movdqu  80(\IN_POS), %xmm9
    movdqu  96(\IN_POS), %xmm10
    movdqu  112(\IN_POS), %xmm11

    pxor    %xmm4, S10                  // input 1 ^ key stream 1
    pxor    %xmm9, S11
    pxor    %xmm10, S12
    pxor    %xmm11, S13

    movdqu  S10, 64(\OUT_POS)           // store output block 1
    movdqu  S11, 80(\OUT_POS)
    movdqu  S12, 96(\OUT_POS)
    movdqu  S13, 112(\OUT_POS)
.endm

/* Advance the counter by one and load working state 0 from the original state. */
.macro GENERATE_1_STATE
    add     $1, %r11d
    pinsrd  $0, %r11d, O03

    movdqa  O00, S00                    // working state 0 = original state
    movdqa  O01, S01
    movdqa  O02, S02
    movdqa  O03, S03
.endm

/*
 * Advance the counter by two and load both working states. State 0 uses the
 * first new counter value (also left in O03), state 1 the second one (left
 * in r11d and in lane 0 of S13).
 */
.macro GENERATE_2_STATE
    add     $1, %r11d
    pinsrd  $0, %r11d, O03

    movdqa  O00, S00                    // working state 0
    movdqa  O01, S01
    movdqa  O02, S02
    movdqa  O03, S03
    movdqa  O00, S10                    // working state 1
    movdqa  O01, S11
    movdqa  O02, S12
    movdqa  O03, S13

    add     $1, %r11d
    pinsrd  $0, %r11d, S13              // state 1 gets the following counter value
.endm

/*
 * Register layouts used in this file (numbers are ChaCha20 state word indices):
 *
 * Processing 64 bytes: 4 xmm registers
 *      xmm0 {0,  1,  2,  3}
 *      xmm1 {4,  5,  6,  7}
 *      xmm2 {8,  9,  10, 11}
 *      xmm3 {12, 13, 14, 15}
 *
 * Processing 128 bytes: 8 xmm registers
 *      xmm0 {0,  1,  2,  3}        xmm5 {0,  1,  2,  3}
 *      xmm1 {4,  5,  6,  7}        xmm6 {4,  5,  6,  7}
 *      xmm2 {8,  9,  10, 11}       xmm7 {8,  9,  10, 11}
 *      xmm3 {12, 13, 14, 15}       xmm8 {12, 13, 14, 15}
 *
 * Wider 256-byte and 512-byte layouts (one state word broadcast per register,
 * i.e. register N holds {N, N, N, N}) are not implemented by the code in
 * this file.
 */

/**
 * @Interconnection with the C interface:
 *      void CHACHA20_Update(CRYPT_CHACHA20_Ctx *ctx, const uint8_t *in, uint8_t *out, uint32_t len);
 * @brief ChaCha20 encryption of whole 64-byte blocks.
 * @param ctx [IN/OUT] rdi: algorithm context. The 16 state words are read from
 *                     offsets 0..63; the 32-bit counter at offset 48 is advanced
 *                     by the number of blocks processed and written back.
 * @param in  [IN]     rsi: data to be encrypted
 * @param out [OUT]    rdx: data after encryption
 * @param len [IN]     ecx: length in bytes. Only complete 64-byte blocks are
 *                     processed; the trailing len % 64 bytes are left untouched
 *                     (presumably handled by the C caller - verify).
 *
 * Register roles: r9 = in cursor, r10 = out cursor, r11d = block counter - 1,
 *                 r12 = bytes remaining, r8 = double-round loop counter.
 * Clobbers: r8, r9, r10, r11, xmm0-xmm15, flags. r12 is saved and restored.
 */
.globl  CHACHA20_Update
.type   CHACHA20_Update,%function
.align  64
CHACHA20_Update:
    .cfi_startproc
    push    %r12
    .cfi_adjust_cfa_offset 8
    .cfi_offset %r12, -16
    mov     %ecx, %r12d                 // len is uint32_t: bits 63:32 of rcx are undefined, zero-extend
    mov     CTR_OFFSET(%rdi), %r11d     // r11d = current block counter
    mov     %rsi, IN
    mov     %rdx, OUT

    movdqu  (%rdi), O00                 // state[0-3]
    movdqu  16(%rdi), O01               // state[4-7]
    movdqu  32(%rdi), O02               // state[8-11]
    movdqu  48(%rdi), O03               // state[12-15]

    sub     $1, %r11d                   // GENERATE_* pre-increments, so start one below

.LChaCha20_start:
    cmp     $128, %r12
    jae     .LChaCha20_128_start
    cmp     $64, %r12
    jae     .LChaCha20_64_start
    jmp     .LChaCha20_end              // fewer than 64 bytes remain

.LChaCha20_64_start:
    GENERATE_1_STATE
    mov     $DOUBLE_ROUNDS, %r8

.LChaCha20_64_loop:
    /* Column round: QR(0,4,8,12) QR(1,5,9,13) QR(2,6,10,14) QR(3,7,11,15) */
    CHACHA20_ROUND S00 S01 S02 S03 %xmm4
    DIAGONALIZE S01 S02 S03

    /* Diagonal round: QR(0,5,10,15) QR(1,6,11,12) QR(2,7,8,13) QR(3,4,9,14) */
    CHACHA20_ROUND S00 S01 S02 S03 %xmm4
    UNDIAGONALIZE S01 S02 S03

    sub     $1, %r8
    jnz     .LChaCha20_64_loop

    WRITE_BACK_64 IN OUT

    add     $64, IN
    add     $64, OUT

    sub     $64, %r12
    jmp     .LChaCha20_start

.LChaCha20_128_start:
    GENERATE_2_STATE
    mov     $DOUBLE_ROUNDS, %r8

.LChaCha20_128_loop:
    /* Column round on both states */
    CHACHA20_2_ROUND S00 S01 S02 S03 %xmm4 S10 S11 S12 S13 %xmm9
    DIAGONALIZE S01 S02 S03
    DIAGONALIZE S11 S12 S13

    /* Diagonal round on both states */
    CHACHA20_2_ROUND S00 S01 S02 S03 %xmm4 S10 S11 S12 S13 %xmm9
    UNDIAGONALIZE S01 S02 S03
    UNDIAGONALIZE S11 S12 S13

    sub     $1, %r8
    jnz     .LChaCha20_128_loop

    WRITE_BACK_128 IN OUT
    add     $128, IN
    add     $128, OUT

    sub     $128, %r12
    jmp     .LChaCha20_start

.LChaCha20_end:
    add     $1, %r11d                   // undo the initial bias: next unused counter value
    mov     %r11d, CTR_OFFSET(%rdi)
    pop     %r12
    .cfi_adjust_cfa_offset -8
    .cfi_restore %r12
    ret
    .cfi_endproc

.size   CHACHA20_Update,.-CHACHA20_Update

#endif