/*
 * Copyright 2017-2022 The OpenSSL Project Authors. All Rights Reserved.
 * Copyright 2014 Cryptography Research, Inc.
 *
 * Licensed under the Apache License 2.0 (the "License"). You may not use
 * this file except in compliance with the License. You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 *
 * Originally written by Mike Hamburg
 */

#include "e_os.h"
#include <openssl/macros.h>
#include "internal/numbers.h"

#ifndef UINT128_MAX
/* No support for 128 bit ints, so do nothing here */
NON_EMPTY_TRANSLATION_UNIT
#else

# include "../field.h"
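
/*
 * Field elements are stored as eight 64-bit limbs holding 56 bits each
 * (radix 2^56); mask = 2^56 - 1 is used below to split accumulators back
 * into limbs.  Arithmetic is modulo p = 2^448 - 2^224 - 1, for which
 * 2^224 is a "golden ratio": (2^224)^2 == 2^224 + 1 (mod p).  The sums
 * aa[i] = a[i] + a[i + 4] (and bb/bbb) rely on that identity to fold the
 * 8x8 limb multiplication into three 4x4 passes, Karatsuba style.
 * widemul() is expected to return the full 128-bit product of its 64-bit
 * arguments; accum0/accum1/accum2 are 128 bits wide so that whole columns
 * can be accumulated before carries are propagated.
 */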

void gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs)
{
    const uint64_t *a = as->limb, *b = bs->limb;
    uint64_t *c = cs->limb;
    uint128_t accum0 = 0, accum1 = 0, accum2;
    uint64_t mask = (1ULL << 56) - 1;
    uint64_t aa[4], bb[4], bbb[4];
    unsigned int i, j;

    for (i = 0; i < 4; i++) {
        aa[i] = a[i] + a[i + 4];
        bb[i] = b[i] + b[i + 4];
        bbb[i] = bb[i] + b[i + 4];
    }

    for (i = 0; i < 4; i++) {
        accum2 = 0;

        for (j = 0; j <= i; j++) {
            accum2 += widemul(a[j], b[i - j]);
            accum1 += widemul(aa[j], bb[i - j]);
            accum0 += widemul(a[j + 4], b[i - j + 4]);
        }
        for (; j < 4; j++) {
            accum2 += widemul(a[j], b[i - j + 8]);
            accum1 += widemul(aa[j], bbb[i - j + 4]);
            accum0 += widemul(a[j + 4], bb[i - j + 4]);
        }

        accum1 -= accum2;
        accum0 += accum2;

        c[i] = ((uint64_t)(accum0)) & mask;
        c[i + 4] = ((uint64_t)(accum1)) & mask;

        accum0 >>= 56;
        accum1 >>= 56;
    }

    accum0 += accum1;
    accum0 += c[4];
    accum1 += c[0];
    c[4] = ((uint64_t)(accum0)) & mask;
    c[0] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    c[5] += ((uint64_t)(accum0));
    c[1] += ((uint64_t)(accum1));
}

void gf_mulw_unsigned(gf_s * RESTRICT cs, const gf as, uint32_t b)
{
    const uint64_t *a = as->limb;
    uint64_t *c = cs->limb;
    uint128_t accum0 = 0, accum4 = 0;
    uint64_t mask = (1ULL << 56) - 1;
    int i;

    for (i = 0; i < 4; i++) {
        accum0 += widemul(b, a[i]);
        accum4 += widemul(b, a[i + 4]);
        c[i] = accum0 & mask;
        accum0 >>= 56;
        c[i + 4] = accum4 & mask;
        accum4 >>= 56;
    }

    accum0 += accum4 + c[4];
    c[4] = accum0 & mask;
    c[5] += accum0 >> 56;

    accum4 += c[0];
    c[0] = accum4 & mask;
    c[1] += accum4 >> 56;
}

void gf_sqr(gf_s * RESTRICT cs, const gf as)
{
    const uint64_t *a = as->limb;
    uint64_t *c = cs->limb;
    uint128_t accum0 = 0, accum1 = 0, accum2;
    uint64_t mask = (1ULL << 56) - 1;
    uint64_t aa[4];
    unsigned int i;

    /* For some reason clang doesn't vectorize this without prompting? */
    for (i = 0; i < 4; i++)
        aa[i] = a[i] + a[i + 4];

    accum2 = widemul(a[0], a[3]);
    accum0 = widemul(aa[0], aa[3]);
    accum1 = widemul(a[4], a[7]);

    accum2 += widemul(a[1], a[2]);
    accum0 += widemul(aa[1], aa[2]);
    accum1 += widemul(a[5], a[6]);

    accum0 -= accum2;
    accum1 += accum2;

    c[3] = ((uint64_t)(accum1)) << 1 & mask;
    c[7] = ((uint64_t)(accum0)) << 1 & mask;

    accum0 >>= 55;
    accum1 >>= 55;

    accum0 += widemul(2 * aa[1], aa[3]);
    accum1 += widemul(2 * a[5], a[7]);
    accum0 += widemul(aa[2], aa[2]);
    accum1 += accum0;

    accum0 -= widemul(2 * a[1], a[3]);
    accum1 += widemul(a[6], a[6]);

    accum2 = widemul(a[0], a[0]);
    accum1 -= accum2;
    accum0 += accum2;

    accum0 -= widemul(a[2], a[2]);
    accum1 += widemul(aa[0], aa[0]);
    accum0 += widemul(a[4], a[4]);

    c[0] = ((uint64_t)(accum0)) & mask;
    c[4] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    accum2 = widemul(2 * aa[2], aa[3]);
    accum0 -= widemul(2 * a[2], a[3]);
    accum1 += widemul(2 * a[6], a[7]);

    accum1 += accum2;
    accum0 += accum2;

    accum2 = widemul(2 * a[0], a[1]);
    accum1 += widemul(2 * aa[0], aa[1]);
    accum0 += widemul(2 * a[4], a[5]);

    accum1 -= accum2;
    accum0 += accum2;

    c[1] = ((uint64_t)(accum0)) & mask;
    c[5] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    accum2 = widemul(aa[3], aa[3]);
    accum0 -= widemul(a[3], a[3]);
    accum1 += widemul(a[7], a[7]);

    accum1 += accum2;
    accum0 += accum2;

    accum2 = widemul(2 * a[0], a[2]);
    accum1 += widemul(2 * aa[0], aa[2]);
    accum0 += widemul(2 * a[4], a[6]);

    accum2 += widemul(a[1], a[1]);
    accum1 += widemul(aa[1], aa[1]);
    accum0 += widemul(a[5], a[5]);

    accum1 -= accum2;
    accum0 += accum2;

    c[2] = ((uint64_t)(accum0)) & mask;
    c[6] = ((uint64_t)(accum1)) & mask;

    accum0 >>= 56;
    accum1 >>= 56;

    accum0 += c[3];
    accum1 += c[7];
    c[3] = ((uint64_t)(accum0)) & mask;
    c[7] = ((uint64_t)(accum1)) & mask;

    /* we could almost stop here, but it wouldn't be stable, so... */

    accum0 >>= 56;
    accum1 >>= 56;
    c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
    c[0] += ((uint64_t)(accum1));
}
#endif