1 /* 2 * Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. 3 * Copyright (c) 2012, Intel Corporation. All Rights Reserved. 4 * 5 * Licensed under the OpenSSL license (the "License"). You may not use 6 * this file except in compliance with the License. You can obtain a copy 7 * in the file LICENSE in the source distribution or at 8 * https://www.openssl.org/source/license.html 9 * 10 * Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) 11 * (1) Intel Corporation, Israel Development Center, Haifa, Israel 12 * (2) University of Haifa, Israel 13 */ 14 15 #include <openssl/opensslconf.h> 16 #include "rsaz_exp.h" 17 18 #ifndef RSAZ_ENABLED 19 NON_EMPTY_TRANSLATION_UNIT 20 #else 21 22 /* 23 * See crypto/bn/asm/rsaz-avx2.pl for further details. 24 */ 25 void rsaz_1024_norm2red_avx2(void *red, const void *norm); 26 void rsaz_1024_mul_avx2(void *ret, const void *a, const void *b, 27 const void *n, BN_ULONG k); 28 void rsaz_1024_sqr_avx2(void *ret, const void *a, const void *n, BN_ULONG k, 29 int cnt); 30 void rsaz_1024_scatter5_avx2(void *tbl, const void *val, int i); 31 void rsaz_1024_gather5_avx2(void *val, const void *tbl, int i); 32 void rsaz_1024_red2norm_avx2(void *norm, const void *red); 33 34 #if defined(__GNUC__) 35 # define ALIGN64 __attribute__((aligned(64))) 36 #elif defined(_MSC_VER) 37 # define ALIGN64 __declspec(align(64)) 38 #elif defined(__SUNPRO_C) 39 # define ALIGN64 40 # pragma align 64(one,two80) 41 #else 42 /* not fatal, might hurt performance a little */ 43 # define ALIGN64 44 #endif 45 46 ALIGN64 static const BN_ULONG one[40] = { 47 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 49 }; 50 51 ALIGN64 static const BN_ULONG two80[40] = { 52 0, 0, 1 << 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 53 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 54 }; 55 56 void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16], 57 const BN_ULONG base_norm[16], 58 const BN_ULONG exponent[16], 59 const BN_ULONG m_norm[16], const BN_ULONG RR[16], 60 BN_ULONG k0) 61 { 62 unsigned char storage[320 * 3 + 32 * 9 * 16 + 64]; /* 5.5KB */ 63 unsigned char *p_str = storage + (64 - ((size_t)storage % 64)); 64 unsigned char *a_inv, *m, *result; 65 unsigned char *table_s = p_str + 320 * 3; 66 unsigned char *R2 = table_s; /* borrow */ 67 int index; 68 int wvalue; 69 70 if ((((size_t)p_str & 4095) + 320) >> 12) { 71 result = p_str; 72 a_inv = p_str + 320; 73 m = p_str + 320 * 2; /* should not cross page */ 74 } else { 75 m = p_str; /* should not cross page */ 76 result = p_str + 320; 77 a_inv = p_str + 320 * 2; 78 } 79 80 rsaz_1024_norm2red_avx2(m, m_norm); 81 rsaz_1024_norm2red_avx2(a_inv, base_norm); 82 rsaz_1024_norm2red_avx2(R2, RR); 83 84 rsaz_1024_mul_avx2(R2, R2, R2, m, k0); 85 rsaz_1024_mul_avx2(R2, R2, two80, m, k0); 86 87 /* table[0] = 1 */ 88 rsaz_1024_mul_avx2(result, R2, one, m, k0); 89 /* table[1] = a_inv^1 */ 90 rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0); 91 92 rsaz_1024_scatter5_avx2(table_s, result, 0); 93 rsaz_1024_scatter5_avx2(table_s, a_inv, 1); 94 95 /* table[2] = a_inv^2 */ 96 rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1); 97 rsaz_1024_scatter5_avx2(table_s, result, 2); 98 #if 0 99 /* this is almost 2x smaller and less than 1% slower */ 100 for (index = 3; index < 32; index++) { 101 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 102 rsaz_1024_scatter5_avx2(table_s, result, index); 103 } 104 #else 105 /* table[4] = a_inv^4 */ 106 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 107 rsaz_1024_scatter5_avx2(table_s, result, 4); 108 /* table[8] = a_inv^8 */ 109 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 110 rsaz_1024_scatter5_avx2(table_s, result, 8); 111 /* table[16] = a_inv^16 */ 112 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 113 rsaz_1024_scatter5_avx2(table_s, result, 16); 114 /* table[17] = a_inv^17 */ 115 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 116 rsaz_1024_scatter5_avx2(table_s, result, 17); 117 118 /* table[3] */ 119 rsaz_1024_gather5_avx2(result, table_s, 2); 120 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 121 rsaz_1024_scatter5_avx2(table_s, result, 3); 122 /* table[6] */ 123 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 124 rsaz_1024_scatter5_avx2(table_s, result, 6); 125 /* table[12] */ 126 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 127 rsaz_1024_scatter5_avx2(table_s, result, 12); 128 /* table[24] */ 129 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 130 rsaz_1024_scatter5_avx2(table_s, result, 24); 131 /* table[25] */ 132 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 133 rsaz_1024_scatter5_avx2(table_s, result, 25); 134 135 /* table[5] */ 136 rsaz_1024_gather5_avx2(result, table_s, 4); 137 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 138 rsaz_1024_scatter5_avx2(table_s, result, 5); 139 /* table[10] */ 140 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 141 rsaz_1024_scatter5_avx2(table_s, result, 10); 142 /* table[20] */ 143 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 144 rsaz_1024_scatter5_avx2(table_s, result, 20); 145 /* table[21] */ 146 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 147 rsaz_1024_scatter5_avx2(table_s, result, 21); 148 149 /* table[7] */ 150 rsaz_1024_gather5_avx2(result, table_s, 6); 151 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 152 rsaz_1024_scatter5_avx2(table_s, result, 7); 153 /* table[14] */ 154 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 155 rsaz_1024_scatter5_avx2(table_s, result, 14); 156 /* table[28] */ 157 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 158 rsaz_1024_scatter5_avx2(table_s, result, 28); 159 /* table[29] */ 160 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 161 rsaz_1024_scatter5_avx2(table_s, result, 29); 162 163 /* table[9] */ 164 rsaz_1024_gather5_avx2(result, table_s, 8); 165 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 166 rsaz_1024_scatter5_avx2(table_s, result, 9); 167 /* table[18] */ 168 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 169 rsaz_1024_scatter5_avx2(table_s, result, 18); 170 /* table[19] */ 171 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 172 rsaz_1024_scatter5_avx2(table_s, result, 19); 173 174 /* table[11] */ 175 rsaz_1024_gather5_avx2(result, table_s, 10); 176 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 177 rsaz_1024_scatter5_avx2(table_s, result, 11); 178 /* table[22] */ 179 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 180 rsaz_1024_scatter5_avx2(table_s, result, 22); 181 /* table[23] */ 182 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 183 rsaz_1024_scatter5_avx2(table_s, result, 23); 184 185 /* table[13] */ 186 rsaz_1024_gather5_avx2(result, table_s, 12); 187 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 188 rsaz_1024_scatter5_avx2(table_s, result, 13); 189 /* table[26] */ 190 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 191 rsaz_1024_scatter5_avx2(table_s, result, 26); 192 /* table[27] */ 193 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 194 rsaz_1024_scatter5_avx2(table_s, result, 27); 195 196 /* table[15] */ 197 rsaz_1024_gather5_avx2(result, table_s, 14); 198 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 199 rsaz_1024_scatter5_avx2(table_s, result, 15); 200 /* table[30] */ 201 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 202 rsaz_1024_scatter5_avx2(table_s, result, 30); 203 /* table[31] */ 204 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 205 rsaz_1024_scatter5_avx2(table_s, result, 31); 206 #endif 207 208 /* load first window */ 209 p_str = (unsigned char *)exponent; 210 wvalue = p_str[127] >> 3; 211 rsaz_1024_gather5_avx2(result, table_s, wvalue); 212 213 index = 1014; 214 215 while (index > -1) { /* loop for the remaining 127 windows */ 216 217 rsaz_1024_sqr_avx2(result, result, m, k0, 5); 218 219 wvalue = (p_str[(index / 8) + 1] << 8) | p_str[index / 8]; 220 wvalue = (wvalue >> (index % 8)) & 31; 221 index -= 5; 222 223 rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); /* borrow a_inv */ 224 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 225 } 226 227 /* square four times */ 228 rsaz_1024_sqr_avx2(result, result, m, k0, 4); 229 230 wvalue = p_str[0] & 15; 231 232 rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); /* borrow a_inv */ 233 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 234 235 /* from Montgomery */ 236 rsaz_1024_mul_avx2(result, result, one, m, k0); 237 238 rsaz_1024_red2norm_avx2(result_norm, result); 239 240 OPENSSL_cleanse(storage, sizeof(storage)); 241 } 242 243 /* 244 * See crypto/bn/rsaz-x86_64.pl for further details. 245 */ 246 void rsaz_512_mul(void *ret, const void *a, const void *b, const void *n, 247 BN_ULONG k); 248 void rsaz_512_mul_scatter4(void *ret, const void *a, const void *n, 249 BN_ULONG k, const void *tbl, unsigned int power); 250 void rsaz_512_mul_gather4(void *ret, const void *a, const void *tbl, 251 const void *n, BN_ULONG k, unsigned int power); 252 void rsaz_512_mul_by_one(void *ret, const void *a, const void *n, BN_ULONG k); 253 void rsaz_512_sqr(void *ret, const void *a, const void *n, BN_ULONG k, 254 int cnt); 255 void rsaz_512_scatter4(void *tbl, const BN_ULONG *val, int power); 256 void rsaz_512_gather4(BN_ULONG *val, const void *tbl, int power); 257 258 void RSAZ_512_mod_exp(BN_ULONG result[8], 259 const BN_ULONG base[8], const BN_ULONG exponent[8], 260 const BN_ULONG m[8], BN_ULONG k0, const BN_ULONG RR[8]) 261 { 262 unsigned char storage[16 * 8 * 8 + 64 * 2 + 64]; /* 1.2KB */ 263 unsigned char *table = storage + (64 - ((size_t)storage % 64)); 264 BN_ULONG *a_inv = (BN_ULONG *)(table + 16 * 8 * 8); 265 BN_ULONG *temp = (BN_ULONG *)(table + 16 * 8 * 8 + 8 * 8); 266 unsigned char *p_str = (unsigned char *)exponent; 267 int index; 268 unsigned int wvalue; 269 270 /* table[0] = 1_inv */ 271 temp[0] = 0 - m[0]; 272 temp[1] = ~m[1]; 273 temp[2] = ~m[2]; 274 temp[3] = ~m[3]; 275 temp[4] = ~m[4]; 276 temp[5] = ~m[5]; 277 temp[6] = ~m[6]; 278 temp[7] = ~m[7]; 279 rsaz_512_scatter4(table, temp, 0); 280 281 /* table [1] = a_inv^1 */ 282 rsaz_512_mul(a_inv, base, RR, m, k0); 283 rsaz_512_scatter4(table, a_inv, 1); 284 285 /* table [2] = a_inv^2 */ 286 rsaz_512_sqr(temp, a_inv, m, k0, 1); 287 rsaz_512_scatter4(table, temp, 2); 288 289 for (index = 3; index < 16; index++) 290 rsaz_512_mul_scatter4(temp, a_inv, m, k0, table, index); 291 292 /* load first window */ 293 wvalue = p_str[63]; 294 295 rsaz_512_gather4(temp, table, wvalue >> 4); 296 rsaz_512_sqr(temp, temp, m, k0, 4); 297 rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue & 0xf); 298 299 for (index = 62; index >= 0; index--) { 300 wvalue = p_str[index]; 301 302 rsaz_512_sqr(temp, temp, m, k0, 4); 303 rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue >> 4); 304 305 rsaz_512_sqr(temp, temp, m, k0, 4); 306 rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue & 0x0f); 307 } 308 309 /* from Montgomery */ 310 rsaz_512_mul_by_one(result, temp, m, k0); 311 312 OPENSSL_cleanse(storage, sizeof(storage)); 313 } 314 315 #endif 316