/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#ifdef HITLS_CRYPTO_GCM

/*
 * Syntax: GNU as, AT&T operand order (source operands first, destination last).
 * The functions take their arguments in rdi, rsi, rdx, rcx and use only
 * VEX-encoded 128-bit (xmm) instructions.
 */

.file   "ghash_x86_64.S"
.text

/* Register aliases used only by the 4-block path of GcmHashMultiBlock. */
.set    INL, %xmm11
.set    INH, %xmm12
.set    INM, %xmm13
/* Higher powers of the hash key. */
.set    HKEY3, %xmm14
.set    HKEY4, %xmm15

/* Integer argument registers. */
.set    INPUT_XI, %rdi
.set    HTABLE, %rsi
.set    INPUT_IN, %rdx
.set    LEN, %rcx
.set    LEN32, %ecx                     // 32-bit view of LEN, used to zero-extend inLen
/* Running hash value: low / high 128 bits of the 256-bit product. */
.set    XI_L, %xmm0
.set    XI_H, %xmm1
.set    HKEY, %xmm2

/* Low / high / middle Karatsuba partial products of the input blocks. */
.set    IN_L, %xmm3
.set    IN_H, %xmm4
.set    IN_M, %xmm5
.set    HKEY2, %xmm6
.set    HKEY1_2, %xmm7                  // packed (hi ^ lo) halves of two key powers
.set    TEMP1, %xmm8
.set    TEMP2, %xmm9
.set    MASK, %xmm10

/* Byte-reversal mask for a full 128-bit lane. */
.balign 16
g_bswapMask:
    .byte   15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.size g_bswapMask, .-g_bswapMask

/* Constant that is conditionally XORed into H while it is shifted left by one bit. */
.balign 16
g_polynomial:
    .byte   1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xc2
.size g_polynomial, .-g_polynomial

/* Byte-reversal mask applied independently to each 64-bit half. */
.balign 16
g_64swapMask:
    .byte   7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
.size g_64swapMask, .-g_64swapMask

/* Reduction constant consumed by REDUCTION_256BIT. */
.balign 16
g_poly:
    .byte   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
    .byte   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2
.size g_poly, .-g_poly

/**
 *  Macro description: one block * H (128bit * 128bit), Karatsuba style with three
 *                     carry-less multiplications. The 256-bit product is NOT reduced.
 *  Input registers: xl (block), hKey (key power), hKey12 (its low 64 bits must hold
 *                   hKey.h ^ hKey.l, because the middle product uses selector $0x00)
 *  Change registers: temp1 and temp2
 *  Result register: xh (high 128 bits), xl (low 128 bits)
 */
.macro GHASH_MUL128X128 xh, xl, hKey, hKey12, temp1, temp2
    vpshufd     $0x4e, \xl, \temp1              // swap the 64-bit halves of xl
    vpclmulqdq  $0x11, \hKey, \xl, \xh          // xh = xl.h * hKey.h
    vpxor       \xl, \temp1, \temp1             // temp1 = xl.h ^ xl.l (in both halves)

    vpclmulqdq  $0x00, \hKey, \xl, \xl          // xl = xl.l * hKey.l
    vpxor       \xl, \xh, \temp2                // temp2 = low product ^ high product

    vpclmulqdq  $0x00, \hKey12, \temp1, \temp1  // (xl.h ^ xl.l) * (hKey.h ^ hKey.l)
    vpxor       \temp2, \temp1, \temp1          // temp1 = middle term

    vpslldq     $8, \temp1, \temp2              // fold the middle term into xh:xl
    vpsrldq     $8, \temp1, \temp1
    vpxor       \temp1, \xh, \xh
    vpxor       \temp2, \xl, \xl
.endm

/**
 *  Macro description: 256-bit large number reduction modulo g(x)
 *  Input register: xh, xl
 *  Input symbol: reducMask, a 16-byte aligned constant addressed RIP-relative
 *                (every call site in this file passes g_poly)
 *  Change registers: temp1 and temp2
 *  Result register: xl
 */
.macro REDUCTION_256BIT xh, xl, temp1, temp2, reducMask
    vmovdqa     \reducMask(%rip), \temp1        // g_poly
    vpalignr    $8, \xl, \xl, \temp2            // 1st phase of reduction
    vpclmulqdq  $0x10, \temp1, \xl, \xl
    vpxor       \temp2, \xl, \xl

    vpalignr    $8, \xl, \xl, \temp2            // 2nd phase of reduction
    vpclmulqdq  $0x10, \temp1, \xl, \xl
    vpxor       \xh, \temp2, \temp2
    vpxor       \temp2, \xl, \xl
.endm

/**
 *  Function description: x86_64 hTable pre-computation table implementation (H has been transformed)
 *  Function prototype: void GcmTableGen4bit(uint8_t key[GCM_BLOCKSIZE], MODES_GCM_GF128 hTable[16]);
 *  Input register:
 *        rdi: uint8_t key[GCM_BLOCKSIZE]
 *        rsi: MODES_GCM_GF128 hTable[16]
 *  Change register: xmm0-xmm15
 *  Table layout written by this function (16 bytes per entry):
 *        0x00: H (twisted)      0x10: H^2      0x20: [H^2.h + H^2.l, H.h + H.l]
 *        0x30: H^3              0x40: H^4      0x50: [H^4.h + H^4.l, H^3.h + H^3.l]
 *        0x60: H^5              0x70: H^6      0x80: [H^6.h + H^6.l, H^5.h + H^5.l]
 *  Function/Macro Call:
 *        GHASH_MUL128X128
 *        REDUCTION_256BIT
 */
.align 32
.globl GcmTableGen4bit
.type GcmTableGen4bit, %function
GcmTableGen4bit:
.cfi_startproc
    vmovdqu     (INPUT_XI), HKEY
    vpshufb     g_64swapMask(%rip), HKEY, HKEY  // byte-reverse each 64-bit half
    vpshufd     $0x4e, HKEY, IN_L               // IN_L = H with its halves swapped
    vpshufd     $0x55, HKEY, HKEY               // broadcast carry bit
    vmovdqa     g_polynomial(%rip), IN_H

    vpsrlq      $63, IN_L, IN_M                 // top bit of each 64-bit half
    vpxor       MASK, MASK, MASK
    vpcmpgtd    HKEY, MASK, HKEY                // all-ones when the top bit of H is set
    vpand       IN_H, IN_M, IN_M                // keep only the carry out of the low half
    vpsllq      $1, IN_L, IN_L

    vpshufd     $0x4e, IN_M, IN_M               // move that carry into the high half

    vpand       HKEY, IN_H, IN_H                // polynomial, or zero when no top bit
    vpor        IN_M, IN_L, IN_L                // H<<<=1
    vpxor       IN_L, IN_H, HKEY                // twisted H

    vmovdqu     HKEY, (HTABLE)                  // store in H[0]
    vpshufd     $0x4e, HKEY, HKEY1_2
    vpxor       HKEY, HKEY1_2, HKEY1_2          // H.h ^ H.l in both halves
    vmovdqa     HKEY, XI_L
    /*              xh,    xl, hKey, hKey12, temp1, temp2 */
    GHASH_MUL128X128 XI_H, XI_L, HKEY, HKEY1_2, TEMP1, TEMP2  // calculate H^2
    /*              xh,    xl, temp1, temp2, reducMask */
    REDUCTION_256BIT XI_H, XI_L, TEMP1, TEMP2, g_poly
    vmovdqa     XI_L, HKEY2
    GHASH_MUL128X128 XI_H, XI_L, HKEY, HKEY1_2, TEMP1, TEMP2  // calculate H^3
    REDUCTION_256BIT XI_H, XI_L, TEMP1, TEMP2, g_poly
    vmovdqa     XI_L, HKEY3
    GHASH_MUL128X128 XI_H, XI_L, HKEY, HKEY1_2, TEMP1, TEMP2  // calculate H^4
    REDUCTION_256BIT XI_H, XI_L, TEMP1, TEMP2, g_poly
    vmovdqa     XI_L, HKEY4
    vmovdqu     HKEY2, 0x10(HTABLE)             // store H^2 in H[1]
    vmovdqu     HKEY3, 0x30(HTABLE)             // store H^3 in H[3]
    vmovdqu     HKEY4, 0x40(HTABLE)             // store H^4 in H[4]

    vpshufd     $0x4e, HKEY2, TEMP1
    vpxor       HKEY2, TEMP1, TEMP1
    vshufps     $0x44, TEMP1, HKEY1_2, HKEY1_2
    vmovdqu     HKEY1_2, 0x20(HTABLE)           // store [H^2.h + H^2.l, H.h + H.l] in H[2]

    vpshufd     $0x4e, HKEY3, TEMP1
    vpshufd     $0x4e, HKEY4, TEMP2
    vpxor       HKEY3, TEMP1, TEMP1
    vpxor       HKEY4, TEMP2, TEMP2
    vshufps     $0x44, TEMP2, TEMP1, HKEY1_2
    vmovdqu     HKEY1_2, 0x50(HTABLE)           // store [H^4.h + H^4.l, H^3.h + H^3.l] in H[5]

    vmovdqu     0x20(HTABLE), HKEY1_2           // reload [H^2.h + H^2.l, H.h + H.l]
    GHASH_MUL128X128 XI_H, XI_L, HKEY, HKEY1_2, TEMP1, TEMP2  // calculate H^5, for aes-gcm
    REDUCTION_256BIT XI_H, XI_L, TEMP1, TEMP2, g_poly
    vmovdqa     XI_L, HKEY3                     // HKEY3 now holds H^5
    GHASH_MUL128X128 XI_H, XI_L, HKEY, HKEY1_2, TEMP1, TEMP2  // calculate H^6, for aes-gcm
    REDUCTION_256BIT XI_H, XI_L, TEMP1, TEMP2, g_poly
    vmovdqa     XI_L, HKEY4                     // HKEY4 now holds H^6
    vmovdqu     HKEY3, 0x60(HTABLE)             // store H^5 in H[6]
    vmovdqu     HKEY4, 0x70(HTABLE)             // store H^6 in H[7]
    vpshufd     $0x4e, HKEY3, TEMP1
    vpshufd     $0x4e, HKEY4, TEMP2
    vpxor       HKEY3, TEMP1, TEMP1
    vpxor       HKEY4, TEMP2, TEMP2
    vshufps     $0x44, TEMP2, TEMP1, HKEY1_2
    vmovdqu     HKEY1_2, 0x80(HTABLE)           // store [H^6.h + H^6.l, H^5.h + H^5.l] in H[8]

    vpxor       HKEY, HKEY, HKEY                // clear the key powers held in registers
    vpxor       HKEY1_2, HKEY1_2, HKEY1_2
    vpxor       HKEY2, HKEY2, HKEY2
    vpxor       HKEY3, HKEY3, HKEY3
    vpxor       HKEY4, HKEY4, HKEY4
    ret
.cfi_endproc
.size GcmTableGen4bit, .-GcmTableGen4bit

/**
 *  Function description: x86_64 ghash assembly acceleration implementation
 *  Function prototype: void GcmHashMultiBlock(uint8_t t[GCM_BLOCKSIZE], const MODES_GCM_GF128 hTable[16],
 *                                              const uint8_t *in, uint32_t inLen);
 *  Input register:
 *        rdi: uint8_t t[GCM_BLOCKSIZE]
 *        rsi: const MODES_GCM_GF128 hTable[16]
 *        rdx: const uint8_t *in
 *        rcx: uint32_t inLen
 *  Change register: xmm0-xmm15, rcx, rdx, flags
 *  Notes:
 *        inLen is zero-extended from ecx on entry, because the upper half of rcx is
 *        not defined for a 32-bit argument.
 *        When inLen is smaller than one block (including 0), t is written back
 *        unchanged and no input byte is read.
 *        Input is consumed in whole 16-byte blocks only.
 *        NOTE(review): inLen is assumed to be a multiple of 16 - confirm with callers.
 *  Function/Macro Call:
 *        GHASH_MUL128X128
 *        REDUCTION_256BIT             // reduction modulo g(x)
 */
.align 32
.globl GcmHashMultiBlock
.type GcmHashMultiBlock, %function
GcmHashMultiBlock:
.cfi_startproc
    movl        LEN32, LEN32                    // inLen is uint32_t: clear bits 63..32 of LEN
    vmovdqa     g_bswapMask(%rip), MASK
    vmovdqu     (INPUT_XI), XI_L
    vmovdqu     (HTABLE), HKEY
    vmovdqu     0x20(HTABLE), HKEY1_2
    vpshufb     MASK, XI_L, XI_L

    cmp         $0x10, LEN
    jb          .Lend                           // less than one block: store t back unchanged
    je          .Lremain_1block

    vmovdqu     0x10(HTABLE), HKEY2
    cmp         $0x40, LEN
    jae         .Lmul_4blocks
    jmp         .Lremain_Least_2blocks

.align 32
.Lmul_4blocks:
    subq        $0x40, LEN

    vmovdqu     0x30(INPUT_IN), IN_L            // load In_3, In_2
    vmovdqu     0x20(INPUT_IN), INL
    vpshufb     MASK, IN_L, IN_L
    vpshufb     MASK, INL, INL

    vmovdqa     IN_L, IN_H                      // H * In_3
    vpshufd     $0x4e, IN_L, IN_M
    vpxor       IN_L, IN_M, IN_M
    vpclmulqdq  $0x00, HKEY, IN_L, IN_L
    vpclmulqdq  $0x11, HKEY, IN_H, IN_H
    vpclmulqdq  $0x00, HKEY1_2, IN_M, IN_M

    vmovdqa     INL, INH                        // H^2 * In_2
    vpshufd     $0x4e, INL, INM
    vpxor       INL, INM, INM
    vpclmulqdq  $0x00, HKEY2, INL, INL
    vpclmulqdq  $0x11, HKEY2, INH, INH
    vpclmulqdq  $0x10, HKEY1_2, INM, INM
    vxorps      INL, IN_L, IN_L                 // H * In_3 + H^2 * In_2
    vxorps      INH, IN_H, IN_H
    vxorps      INM, IN_M, IN_M

    vmovdqu     0x30(HTABLE), HKEY3
    vmovdqu     0x40(HTABLE), HKEY4
    vmovdqu     0x50(HTABLE), HKEY1_2           // switch to the H^3 / H^4 middle-term keys

    vmovdqu     0x10(INPUT_IN), INL             // load In_1, In_0
    vmovdqu     (INPUT_IN), TEMP1
    vpshufb     MASK, INL, INL
    vpshufb     MASK, TEMP1, TEMP1

    vmovdqa     INL, INH                        // H^3 * In_1
    vpshufd     $0x4e, INL, INM
    vpxor       INL, INM, INM
    vpclmulqdq  $0x00, HKEY3, INL, INL
    vpclmulqdq  $0x11, HKEY3, INH, INH
    vpclmulqdq  $0x00, HKEY1_2, INM, INM
    vxorps      INL, IN_L, IN_L                 // H * In_3 + H^2 * In_2 + H^3 * In_1
    vxorps      INH, IN_H, IN_H
    vxorps      INM, IN_M, IN_M

    vpxor       TEMP1, XI_L, XI_L               // (In_0 + Xi)
    vmovdqa     XI_L, XI_H
    vpshufd     $0x4e, XI_L, TEMP1
    vpxor       XI_L, TEMP1, TEMP1
    vpclmulqdq  $0x00, HKEY4, XI_L, XI_L        // H^4 * (In_0 + Xi)
    vpclmulqdq  $0x11, HKEY4, XI_H, XI_H
    vpclmulqdq  $0x10, HKEY1_2, TEMP1, TEMP1
    vxorps      IN_L, XI_L, XI_L                // H * In_3 + H^2 * In_2 + H^3 * In_1 + H^4 * (In_0 + Xi)
    vxorps      IN_H, XI_H, XI_H
    vxorps      IN_M, TEMP1, TEMP1

    vpxor       XI_L, TEMP1, TEMP1              // middle term ^= low ^ high
    vpxor       XI_H, TEMP1, TEMP1
    vmovdqa     TEMP1, TEMP2
    vpslldq     $8, TEMP1, TEMP1
    vpsrldq     $8, TEMP2, TEMP2
    vpxor       TEMP1, XI_L, XI_L               // fold the middle term into XI_H:XI_L
    vpxor       TEMP2, XI_H, XI_H

    REDUCTION_256BIT XI_H, XI_L, TEMP1, TEMP2, g_poly
    test        LEN, LEN
    jz          .Lend                           // finished all blocks
    leaq        0x40(INPUT_IN), INPUT_IN
    vmovdqu     0x20(HTABLE), HKEY1_2           // restore the H / H^2 middle-term keys
    cmp         $0x40, LEN
    jae         .Lmul_4blocks
    cmp         $0x20, LEN
    jae         .Lremain_Least_2blocks
    jmp         .Lremain_1block

.align 32
.Lremain_Least_2blocks:
    subq        $0x20, LEN
    vmovdqu     0x10(INPUT_IN), IN_L            // load (4 * i) + 1 or 2 block
    vmovdqu     (INPUT_IN), TEMP1
    vpshufb     MASK, IN_L, IN_L
    vpshufb     MASK, TEMP1, TEMP1
    vpxor       TEMP1, XI_L, XI_L               // (first block + Xi)

    vmovdqa     IN_L, IN_H                      // H * second block
    vpshufd     $0x4e, IN_L, IN_M
    vpxor       IN_L, IN_M, IN_M
    vpclmulqdq  $0x00, HKEY, IN_L, IN_L
    vpclmulqdq  $0x11, HKEY, IN_H, IN_H
    vpclmulqdq  $0x00, HKEY1_2, IN_M, IN_M

    vmovdqa     XI_L, XI_H                      // H^2 * (first block + Xi)
    vpshufd     $0x4e, XI_L, TEMP1
    vpxor       XI_L, TEMP1, TEMP1
    vpclmulqdq  $0x00, HKEY2, XI_L, XI_L
    vpclmulqdq  $0x11, HKEY2, XI_H, XI_H
    vpclmulqdq  $0x10, HKEY1_2, TEMP1, TEMP1
    vxorps      IN_L, XI_L, XI_L
    vxorps      IN_H, XI_H, XI_H
    vxorps      IN_M, TEMP1, TEMP1

    vpxor       XI_L, TEMP1, TEMP1              // middle term ^= low ^ high
    vpxor       XI_H, TEMP1, TEMP1
    vmovdqa     TEMP1, TEMP2
    vpslldq     $8, TEMP1, TEMP1
    vpsrldq     $8, TEMP2, TEMP2
    vpxor       TEMP1, XI_L, XI_L               // fold the middle term into XI_H:XI_L
    vpxor       TEMP2, XI_H, XI_H

    REDUCTION_256BIT XI_H, XI_L, TEMP1, TEMP2, g_poly
    test        LEN, LEN
    jz          .Lend
    leaq        0x20(INPUT_IN), INPUT_IN        // one block left: fall through

.align 32
.Lremain_1block:
    subq        $0x10, LEN
    vmovdqu     (INPUT_IN), TEMP1
    vpshufb     MASK, TEMP1, TEMP1
    vpxor       TEMP1, XI_L, XI_L               // (last block + Xi)

    GHASH_MUL128X128 XI_H, XI_L, HKEY, HKEY1_2, TEMP1, TEMP2
    REDUCTION_256BIT XI_H, XI_L, TEMP1, TEMP2, g_poly

.Lend:
    vpshufb     MASK, XI_L, XI_L                // back to the byte order used in memory
    vmovdqu     XI_L, (INPUT_XI)
    vpxor       HKEY, HKEY, HKEY                // clear the key powers held in registers
    vpxor       HKEY1_2, HKEY1_2, HKEY1_2
    vpxor       HKEY2, HKEY2, HKEY2
    vpxor       HKEY3, HKEY3, HKEY3
    vpxor       HKEY4, HKEY4, HKEY4
    ret
.cfi_endproc
.size GcmHashMultiBlock, .-GcmHashMultiBlock

#endif