/* Copyright 2019 The BoringSSL Authors
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

#include <openssl/base.h>

#include "../../internal.h"
#include "internal.h"

#if !defined(BORINGSSL_HAS_UINT128) && defined(OPENSSL_SSE2)
#include <emmintrin.h>
#endif


// This file contains a constant-time implementation of GHASH based on the
// notes in https://bearssl.org/constanttime.html#ghash-for-gcm and the
// reduction algorithm described in
// https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf.
//
// Unlike the BearSSL notes, we use uint128_t in the 64-bit implementation. Our
// primary compilers (clang, clang-cl, and gcc) all support it. MSVC will run
// the 32-bit implementation, but we can use its intrinsics if necessary.

#if defined(BORINGSSL_HAS_UINT128)

static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a,
                           uint64_t b) {
  // One term every four bits means the largest term is 64/4 = 16, which barely
  // overflows into the next term. Using one term every five bits would cost 25
  // multiplications instead of 16. It is faster to mask off the bottom four
  // bits of |a|, giving a largest term of 60/4 = 15, and apply the bottom bits
  // separately.
  uint64_t a0 = a & UINT64_C(0x1111111111111110);
  uint64_t a1 = a & UINT64_C(0x2222222222222220);
  uint64_t a2 = a & UINT64_C(0x4444444444444440);
  uint64_t a3 = a & UINT64_C(0x8888888888888880);

  uint64_t b0 = b & UINT64_C(0x1111111111111111);
  uint64_t b1 = b & UINT64_C(0x2222222222222222);
  uint64_t b2 = b & UINT64_C(0x4444444444444444);
  uint64_t b3 = b & UINT64_C(0x8888888888888888);

  uint128_t c0 = (a0 * (uint128_t)b0) ^ (a1 * (uint128_t)b3) ^
                 (a2 * (uint128_t)b2) ^ (a3 * (uint128_t)b1);
  uint128_t c1 = (a0 * (uint128_t)b1) ^ (a1 * (uint128_t)b0) ^
                 (a2 * (uint128_t)b3) ^ (a3 * (uint128_t)b2);
  uint128_t c2 = (a0 * (uint128_t)b2) ^ (a1 * (uint128_t)b1) ^
                 (a2 * (uint128_t)b0) ^ (a3 * (uint128_t)b3);
  uint128_t c3 = (a0 * (uint128_t)b3) ^ (a1 * (uint128_t)b2) ^
                 (a2 * (uint128_t)b1) ^ (a3 * (uint128_t)b0);

  // Multiply the bottom four bits of |a| with |b|.
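  // Each of |a0_mask| through |a3_mask| below is all-ones when the
  // corresponding low bit of |a| is set and all-zeros otherwise, so ANDing it
  // with |b| selects |b| without branching on secret data. The selected
  // values are then shifted into their bit positions and folded into |extra|.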
  uint64_t a0_mask = UINT64_C(0) - (a & 1);
  uint64_t a1_mask = UINT64_C(0) - ((a >> 1) & 1);
  uint64_t a2_mask = UINT64_C(0) - ((a >> 2) & 1);
  uint64_t a3_mask = UINT64_C(0) - ((a >> 3) & 1);
  uint128_t extra = (a0_mask & b) ^ ((uint128_t)(a1_mask & b) << 1) ^
                    ((uint128_t)(a2_mask & b) << 2) ^
                    ((uint128_t)(a3_mask & b) << 3);

  *out_lo = (((uint64_t)c0) & UINT64_C(0x1111111111111111)) ^
            (((uint64_t)c1) & UINT64_C(0x2222222222222222)) ^
            (((uint64_t)c2) & UINT64_C(0x4444444444444444)) ^
            (((uint64_t)c3) & UINT64_C(0x8888888888888888)) ^ ((uint64_t)extra);
  *out_hi = (((uint64_t)(c0 >> 64)) & UINT64_C(0x1111111111111111)) ^
            (((uint64_t)(c1 >> 64)) & UINT64_C(0x2222222222222222)) ^
            (((uint64_t)(c2 >> 64)) & UINT64_C(0x4444444444444444)) ^
            (((uint64_t)(c3 >> 64)) & UINT64_C(0x8888888888888888)) ^
            ((uint64_t)(extra >> 64));
}

#elif defined(OPENSSL_SSE2)

static __m128i gcm_mul32_nohw(uint32_t a, uint32_t b) {
  // One term every four bits means the largest term is 32/4 = 8, which does
  // not overflow into the next term.
  __m128i aa = _mm_setr_epi32(a, 0, a, 0);
  __m128i bb = _mm_setr_epi32(b, 0, b, 0);

  __m128i a0a0 =
      _mm_and_si128(aa, _mm_setr_epi32(0x11111111, 0, 0x11111111, 0));
  __m128i a2a2 =
      _mm_and_si128(aa, _mm_setr_epi32(0x44444444, 0, 0x44444444, 0));
  __m128i b0b1 =
      _mm_and_si128(bb, _mm_setr_epi32(0x11111111, 0, 0x22222222, 0));
  __m128i b2b3 =
      _mm_and_si128(bb, _mm_setr_epi32(0x44444444, 0, 0x88888888, 0));

  __m128i c0c1 =
      _mm_xor_si128(_mm_mul_epu32(a0a0, b0b1), _mm_mul_epu32(a2a2, b2b3));
  __m128i c2c3 =
      _mm_xor_si128(_mm_mul_epu32(a2a2, b0b1), _mm_mul_epu32(a0a0, b2b3));

  __m128i a1a1 =
      _mm_and_si128(aa, _mm_setr_epi32(0x22222222, 0, 0x22222222, 0));
  __m128i a3a3 =
      _mm_and_si128(aa, _mm_setr_epi32(0x88888888, 0, 0x88888888, 0));
  __m128i b3b0 =
      _mm_and_si128(bb, _mm_setr_epi32(0x88888888, 0, 0x11111111, 0));
  __m128i b1b2 =
      _mm_and_si128(bb, _mm_setr_epi32(0x22222222, 0, 0x44444444, 0));

  c0c1 = _mm_xor_si128(c0c1, _mm_mul_epu32(a1a1, b3b0));
  c0c1 = _mm_xor_si128(c0c1, _mm_mul_epu32(a3a3, b1b2));
  c2c3 = _mm_xor_si128(c2c3, _mm_mul_epu32(a3a3, b3b0));
  c2c3 = _mm_xor_si128(c2c3, _mm_mul_epu32(a1a1, b1b2));

  c0c1 = _mm_and_si128(
      c0c1, _mm_setr_epi32(0x11111111, 0x11111111, 0x22222222, 0x22222222));
  c2c3 = _mm_and_si128(
      c2c3, _mm_setr_epi32(0x44444444, 0x44444444, 0x88888888, 0x88888888));

  c0c1 = _mm_xor_si128(c0c1, c2c3);
  // c0 ^= c1
  c0c1 = _mm_xor_si128(c0c1, _mm_srli_si128(c0c1, 8));
  return c0c1;
}

static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a,
                           uint64_t b) {
  uint32_t a0 = a & 0xffffffff;
  uint32_t a1 = a >> 32;
  uint32_t b0 = b & 0xffffffff;
  uint32_t b1 = b >> 32;
  // Karatsuba multiplication.
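  // In carry-less arithmetic, addition is XOR, so the Karatsuba identity here
  // reads:
  //
  //   a * b = hi * x^64 ^ mid * x^32 ^ lo
  //
  // with lo = a0 * b0, hi = a1 * b1, and mid = (a0 ^ a1) * (b0 ^ b1) ^ lo ^ hi.
  // Each 32x32 carry-less product fits in 64 bits, and |mid| is folded in at a
  // 32-bit offset below.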
  __m128i lo = gcm_mul32_nohw(a0, b0);
  __m128i hi = gcm_mul32_nohw(a1, b1);
  __m128i mid = gcm_mul32_nohw(a0 ^ a1, b0 ^ b1);
  mid = _mm_xor_si128(mid, lo);
  mid = _mm_xor_si128(mid, hi);
  __m128i ret = _mm_unpacklo_epi64(lo, hi);
  mid = _mm_slli_si128(mid, 4);
  mid = _mm_and_si128(mid, _mm_setr_epi32(0, 0xffffffff, 0xffffffff, 0));
  ret = _mm_xor_si128(ret, mid);
  memcpy(out_lo, &ret, 8);
  memcpy(out_hi, ((char*)&ret) + 8, 8);
}

#else  // !BORINGSSL_HAS_UINT128 && !OPENSSL_SSE2

static uint64_t gcm_mul32_nohw(uint32_t a, uint32_t b) {
  // One term every four bits means the largest term is 32/4 = 8, which does
  // not overflow into the next term.
  uint32_t a0 = a & 0x11111111;
  uint32_t a1 = a & 0x22222222;
  uint32_t a2 = a & 0x44444444;
  uint32_t a3 = a & 0x88888888;

  uint32_t b0 = b & 0x11111111;
  uint32_t b1 = b & 0x22222222;
  uint32_t b2 = b & 0x44444444;
  uint32_t b3 = b & 0x88888888;

  uint64_t c0 = (a0 * (uint64_t)b0) ^ (a1 * (uint64_t)b3) ^
                (a2 * (uint64_t)b2) ^ (a3 * (uint64_t)b1);
  uint64_t c1 = (a0 * (uint64_t)b1) ^ (a1 * (uint64_t)b0) ^
                (a2 * (uint64_t)b3) ^ (a3 * (uint64_t)b2);
  uint64_t c2 = (a0 * (uint64_t)b2) ^ (a1 * (uint64_t)b1) ^
                (a2 * (uint64_t)b0) ^ (a3 * (uint64_t)b3);
  uint64_t c3 = (a0 * (uint64_t)b3) ^ (a1 * (uint64_t)b2) ^
                (a2 * (uint64_t)b1) ^ (a3 * (uint64_t)b0);

  return (c0 & UINT64_C(0x1111111111111111)) |
         (c1 & UINT64_C(0x2222222222222222)) |
         (c2 & UINT64_C(0x4444444444444444)) |
         (c3 & UINT64_C(0x8888888888888888));
}

static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a,
                           uint64_t b) {
  uint32_t a0 = a & 0xffffffff;
  uint32_t a1 = a >> 32;
  uint32_t b0 = b & 0xffffffff;
  uint32_t b1 = b >> 32;
  // Karatsuba multiplication.
  uint64_t lo = gcm_mul32_nohw(a0, b0);
  uint64_t hi = gcm_mul32_nohw(a1, b1);
  uint64_t mid = gcm_mul32_nohw(a0 ^ a1, b0 ^ b1) ^ lo ^ hi;
  *out_lo = lo ^ (mid << 32);
  *out_hi = hi ^ (mid >> 32);
}

#endif  // BORINGSSL_HAS_UINT128

void gcm_init_nohw(u128 Htable[16], const uint64_t Xi[2]) {
  // We implement GHASH in terms of POLYVAL, as described in RFC 8452. This
  // avoids a shift by 1 in the multiplication, needed to account for bit
  // reversal losing a bit after multiplication, that is,
  // rev128(X) * rev128(Y) = rev255(X*Y).
  //
  // Per Appendix A, we run mulX_POLYVAL. Note this is the same transformation
  // applied by |gcm_init_clmul|, etc. Note |Xi| has already been byteswapped.
  //
  // See also slide 16 of
  // https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf
  Htable[0].lo = Xi[1];
  Htable[0].hi = Xi[0];

  uint64_t carry = Htable[0].hi >> 63;
  carry = 0u - carry;

  Htable[0].hi <<= 1;
  Htable[0].hi |= Htable[0].lo >> 63;
  Htable[0].lo <<= 1;

  // The irreducible polynomial is 1 + x^121 + x^126 + x^127 + x^128, so we
  // conditionally add 0xc200...0001.
  Htable[0].lo ^= carry & 1;
  Htable[0].hi ^= carry & UINT64_C(0xc200000000000000);

  // This implementation does not use the rest of |Htable|.
}

static void gcm_polyval_nohw(uint64_t Xi[2], const u128 *H) {
  // Karatsuba multiplication. The product of |Xi| and |H| is stored in |r0|
  // through |r3|. Note there is no byte or bit reversal because we are
  // evaluating POLYVAL.
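  //
  // As above, the low 128-bit product lands in |r0| and |r1|, the high product
  // in |r2| and |r3|, and the middle Karatsuba term is folded in at a 64-bit
  // offset, i.e. into |r1| and |r2|.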
  uint64_t r0, r1;
  gcm_mul64_nohw(&r0, &r1, Xi[0], H->lo);
  uint64_t r2, r3;
  gcm_mul64_nohw(&r2, &r3, Xi[1], H->hi);
  uint64_t mid0, mid1;
  gcm_mul64_nohw(&mid0, &mid1, Xi[0] ^ Xi[1], H->hi ^ H->lo);
  mid0 ^= r0 ^ r2;
  mid1 ^= r1 ^ r3;
  r2 ^= mid1;
  r1 ^= mid0;

  // Now we multiply our 256-bit result by x^-128 and reduce. |r2| and |r3|
  // shift into position and we must multiply |r0| and |r1| by x^-128. We
  // have:
  //
  //       1 = x^121 + x^126 + x^127 + x^128
  //  x^-128 = x^-7 + x^-2 + x^-1 + 1
  //
  // This is the GHASH reduction step, but with bits flowing in reverse.

  // The x^-7, x^-2, and x^-1 terms shift bits past x^0, which would require
  // another reduction step. Instead, we gather the excess bits, incorporate
  // them into |r0| and |r1| and reduce once. See slides 17-19
  // of https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf.
  r1 ^= (r0 << 63) ^ (r0 << 62) ^ (r0 << 57);

  // 1
  r2 ^= r0;
  r3 ^= r1;

  // x^-1
  r2 ^= r0 >> 1;
  r2 ^= r1 << 63;
  r3 ^= r1 >> 1;

  // x^-2
  r2 ^= r0 >> 2;
  r2 ^= r1 << 62;
  r3 ^= r1 >> 2;

  // x^-7
  r2 ^= r0 >> 7;
  r2 ^= r1 << 57;
  r3 ^= r1 >> 7;

  Xi[0] = r2;
  Xi[1] = r3;
}

void gcm_gmult_nohw(uint8_t Xi[16], const u128 Htable[16]) {
  uint64_t swapped[2];
  swapped[0] = CRYPTO_load_u64_be(Xi + 8);
  swapped[1] = CRYPTO_load_u64_be(Xi);
  gcm_polyval_nohw(swapped, &Htable[0]);
  CRYPTO_store_u64_be(Xi, swapped[1]);
  CRYPTO_store_u64_be(Xi + 8, swapped[0]);
}

void gcm_ghash_nohw(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
                    size_t len) {
  uint64_t swapped[2];
  swapped[0] = CRYPTO_load_u64_be(Xi + 8);
  swapped[1] = CRYPTO_load_u64_be(Xi);

  while (len >= 16) {
    swapped[0] ^= CRYPTO_load_u64_be(inp + 8);
    swapped[1] ^= CRYPTO_load_u64_be(inp);
    gcm_polyval_nohw(swapped, &Htable[0]);
    inp += 16;
    len -= 16;
  }

  CRYPTO_store_u64_be(Xi, swapped[1]);
  CRYPTO_store_u64_be(Xi + 8, swapped[0]);
}
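
// A minimal usage sketch, not part of the real call path: the GCM layer loads
// the hash key as two big-endian 64-bit words, calls |gcm_init_nohw| once, and
// then absorbs full 16-byte blocks with |gcm_ghash_nohw|. The helper name
// |gcm_nohw_example_ghash_block| below is hypothetical and for illustration
// only.
#if 0
static void gcm_nohw_example_ghash_block(uint8_t out[16],
                                         const uint8_t key[16],
                                         const uint8_t block[16]) {
  // Load the hash key H as two big-endian 64-bit halves, as the GCM layer
  // does before calling |gcm_init_nohw|.
  uint64_t h[2];
  h[0] = CRYPTO_load_u64_be(key);
  h[1] = CRYPTO_load_u64_be(key + 8);

  u128 Htable[16];
  gcm_init_nohw(Htable, h);

  // GHASH starts from the all-zero state; absorbing a single block gives
  // out = block * H in GF(2^128).
  memset(out, 0, 16);
  gcm_ghash_nohw(out, Htable, block, 16);
}
#endif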