/* Copyright (c) 2014, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

// This implementation of poly1305 is by Andrew Moon
// (https://github.com/floodyberry/poly1305-donna) and released as public
// domain. It implements SIMD vectorization based on the algorithm described in
// http://cr.yp.to/papers.html#neoncrypto. Unrolled to 2 powers, i.e. 64 byte
// block size.
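//
// Overview of the code below: the key r is kept in two forms. A scalar
// base-2^44 form (limbs of 44, 44 and 42 bits) is used by the init path and by
// the scalar tail in CRYPTO_poly1305_finish; vectors of five 26-bit limbs
// holding the powers r^2 and r^4 drive the 64-byte SIMD block loop, and
// poly1305_combine folds the two SIMD lanes back into the scalar accumulator.
//
// Illustrative use of the public API defined in this file (a sketch, not part
// of the library; `key`, `msg`, `msg_len` and `mac` are caller-provided, and
// poly1305_aligned_state below expects the state to be 64-byte aligned):
//
//   poly1305_state state;
//   CRYPTO_poly1305_init(&state, key);             // key: 32 bytes (r || s)
//   CRYPTO_poly1305_update(&state, msg, msg_len);  // may be called repeatedly
//   CRYPTO_poly1305_finish(&state, mac);           // writes the 16-byte tag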

#include <ring-core/poly1305.h>

#include "internal.h"
#include "../internal.h"


#if defined(BORINGSSL_HAS_UINT128) && defined(OPENSSL_X86_64)

#pragma GCC diagnostic ignored "-Wcast-align"
#pragma GCC diagnostic ignored "-Wsign-conversion"

#include <emmintrin.h>

static uint32_t load_u32_le(const uint8_t in[4]) {
  uint32_t ret;
  OPENSSL_memcpy(&ret, in, 4);
  return ret;
}

static uint64_t load_u64_le(const uint8_t in[8]) {
  uint64_t ret;
  OPENSSL_memcpy(&ret, in, 8);
  return ret;
}

static void store_u64_le(uint8_t out[8], uint64_t v) {
  OPENSSL_memcpy(out, &v, 8);
}

typedef __m128i xmmi;

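// Constants for the SIMD path. Each xmmi value holds one radix-2^26 limb for
// each of the two 64-bit lanes (the interleaved "x" and "y" message streams):
// poly1305_x64_sse2_message_mask extracts a 26-bit limb from each lane,
// poly1305_x64_sse2_5 is the factor applied when a carry out of the top limb
// wraps around (2^130 mod (2^130 - 5) = 5), and poly1305_x64_sse2_1shl128 is
// the 2^128 padding bit of each 16-byte block, which lands at bit 24 of the
// fifth 26-bit limb (128 - 4*26 = 24).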
static const alignas(16) uint32_t poly1305_x64_sse2_message_mask[4] = {
    (1 << 26) - 1, 0, (1 << 26) - 1, 0};
static const alignas(16) uint32_t poly1305_x64_sse2_5[4] = {5, 0, 5, 0};
static const alignas(16) uint32_t poly1305_x64_sse2_1shl128[4] = {
    (1 << 24), 0, (1 << 24), 0};

static inline uint128_t add128(uint128_t a, uint128_t b) { return a + b; }

static inline uint128_t add128_64(uint128_t a, uint64_t b) { return a + b; }

static inline uint128_t mul64x64_128(uint64_t a, uint64_t b) {
  return (uint128_t)a * b;
}

static inline uint64_t lo128(uint128_t a) { return (uint64_t)a; }

static inline uint64_t shr128(uint128_t v, const int shift) {
  return (uint64_t)(v >> shift);
}

static inline uint64_t shr128_pair(uint64_t hi, uint64_t lo, const int shift) {
  return (uint64_t)((((uint128_t)hi << 64) | lo) >> shift);
}

typedef struct poly1305_power_t {
  union {
    xmmi v;
    uint64_t u[2];
    uint32_t d[4];
  } R20, R21, R22, R23, R24, S21, S22, S23, S24;
} poly1305_power;

typedef struct poly1305_state_internal_t {
  poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144
                          bytes of free storage */
  union {
    xmmi H[5];  //  80 bytes
    uint64_t HH[10];
  };
  // uint64_t r0,r1,r2;       [24 bytes]
  // uint64_t pad0,pad1;      [16 bytes]
  uint64_t started;        //   8 bytes
  uint64_t leftover;       //   8 bytes
  uint8_t buffer[64];      //  64 bytes
} poly1305_state_internal; /* 448 bytes total + 63 bytes for
                              alignment = 511 bytes raw */

OPENSSL_STATIC_ASSERT(sizeof(poly1305_state_internal) <= sizeof(poly1305_state),
     "poly1305_state isn't large enough to hold aligned poly1305_state_internal");

static inline poly1305_state_internal *poly1305_aligned_state(
    poly1305_state *state) {
  dev_assert_secret(((uintptr_t)state & 63) == 0);
  return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63);
}

static inline size_t poly1305_min(size_t a, size_t b) {
  return (a < b) ? a : b;
}

void CRYPTO_poly1305_init(poly1305_state *state, const uint8_t key[32]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  poly1305_power *p;
  uint64_t r0, r1, r2;
  uint64_t t0, t1;

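  // The masks below are the standard Poly1305 clamp
  // (r &= 0x0ffffffc0ffffffc0ffffffc0fffffff) expressed over a base-2^44
  // split of the 128-bit value r: r0 = r[0..43], r1 = r[44..87],
  // r2 = r[88..127].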
  // clamp key
  t0 = load_u64_le(key + 0);
  t1 = load_u64_le(key + 8);
  r0 = t0 & 0xffc0fffffff;
  t0 >>= 44;
  t0 |= t1 << 20;
  r1 = t0 & 0xfffffc0ffff;
  t1 >>= 24;
  r2 = t1 & 0x00ffffffc0f;

  // store r in un-used space of st->P[1]
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);

  // store pad
  p->R23.d[1] = load_u32_le(key + 16);
  p->R23.d[3] = load_u32_le(key + 20);
  p->R24.d[1] = load_u32_le(key + 24);
  p->R24.d[3] = load_u32_le(key + 28);

  // H = 0
  st->H[0] = _mm_setzero_si128();
  st->H[1] = _mm_setzero_si128();
  st->H[2] = _mm_setzero_si128();
  st->H[3] = _mm_setzero_si128();
  st->H[4] = _mm_setzero_si128();

  st->started = 0;
  st->leftover = 0;
}

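// Computes r^2 and r^4 from the stored r (still in base 2^44), converts each
// power to five 26-bit limbs splatted across both SIMD lanes, and stores r^4
// in st->P[0] and r^2 in st->P[1]. The scalar r and the pad are then written
// back into the unused odd 32-bit slots of st->P[1], and the first 32 bytes of
// the message are loaded into the two lanes of the accumulator H.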
static void poly1305_first_block(poly1305_state_internal *st,
                                 const uint8_t *m) {
  const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
  xmmi T5, T6;
  poly1305_power *p;
  uint128_t d[3];
  uint64_t r0, r1, r2;
  uint64_t r20, r21, r22, s22;
  uint64_t pad0, pad1;
  uint64_t c;
  uint64_t i;

  // pull out stored info
  p = &st->P[1];

  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  pad0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];

  // compute powers r^2,r^4
  r20 = r0;
  r21 = r1;
  r22 = r2;
  for (i = 0; i < 2; i++) {
    s22 = r22 * (5 << 2);

    d[0] = add128(mul64x64_128(r20, r20), mul64x64_128(r21 * 2, s22));
    d[1] = add128(mul64x64_128(r22, s22), mul64x64_128(r20 * 2, r21));
    d[2] = add128(mul64x64_128(r21, r21), mul64x64_128(r22 * 2, r20));

    r20 = lo128(d[0]) & 0xfffffffffff;
    c = shr128(d[0], 44);
    d[1] = add128_64(d[1], c);
    r21 = lo128(d[1]) & 0xfffffffffff;
    c = shr128(d[1], 44);
    d[2] = add128_64(d[2], c);
    r22 = lo128(d[2]) & 0x3ffffffffff;
    c = shr128(d[2], 42);
    r20 += c * 5;
    c = (r20 >> 44);
    r20 = r20 & 0xfffffffffff;
    r21 += c;

    p->R20.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)(r20)&0x3ffffff),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->R21.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R22.v =
        _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 8)) & 0x3ffffff),
                          _MM_SHUFFLE(1, 0, 1, 0));
    p->R23.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R24.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r22 >> 16))),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->S21.v = _mm_mul_epu32(p->R21.v, FIVE);
    p->S22.v = _mm_mul_epu32(p->R22.v, FIVE);
    p->S23.v = _mm_mul_epu32(p->R23.v, FIVE);
    p->S24.v = _mm_mul_epu32(p->R24.v, FIVE);
    p--;
  }

  // put saved info back
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);
  p->R23.d[1] = (uint32_t)(pad0);
  p->R23.d[3] = (uint32_t)(pad0 >> 32);
  p->R24.d[1] = (uint32_t)(pad1);
  p->R24.d[3] = (uint32_t)(pad1 >> 32);

  // H = [Mx,My]
  T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                          _mm_loadl_epi64((const xmmi *)(m + 16)));
  T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                          _mm_loadl_epi64((const xmmi *)(m + 24)));
  st->H[0] = _mm_and_si128(MMASK, T5);
  st->H[1] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
  st->H[2] = _mm_and_si128(MMASK, T5);
  st->H[3] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  st->H[4] = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
}

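// Consumes the message in 64-byte chunks, each iteration advancing two
// interleaved Horner evaluations by one step: the accumulator pair is
// multiplied by [r^4,r^4] (st->P[0]), the first two 16-byte blocks are
// multiplied by [r^2,r^2] (st->P[1]) and added, and the last two blocks are
// added directly, i.e. H = H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx',My'].
// Carries are then propagated limb to limb, with the carry out of the top
// limb folded back into the bottom limb times five.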
static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m,
                            size_t bytes) {
  const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi M0, M1, M2, M3, M4;
  xmmi C1, C2;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  while (bytes >= 64) {
    // H *= [r^4,r^4]
    p = &st->P[0];
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]*[r^2,r^2]
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    p = &st->P[1];
    T5 = _mm_mul_epu32(M0, p->R20.v);
    T6 = _mm_mul_epu32(M0, p->R21.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M1, p->S24.v);
    T6 = _mm_mul_epu32(M1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M2, p->S23.v);
    T6 = _mm_mul_epu32(M2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M3, p->S22.v);
    T6 = _mm_mul_epu32(M3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M4, p->S21.v);
    T6 = _mm_mul_epu32(M4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M0, p->R22.v);
    T6 = _mm_mul_epu32(M0, p->R23.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M1, p->R21.v);
    T6 = _mm_mul_epu32(M1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M2, p->R20.v);
    T6 = _mm_mul_epu32(M2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M3, p->S24.v);
    T6 = _mm_mul_epu32(M3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M4, p->S23.v);
    T6 = _mm_mul_epu32(M4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M0, p->R24.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 32)),
                            _mm_loadl_epi64((const xmmi *)(m + 48)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 40)),
                            _mm_loadl_epi64((const xmmi *)(m + 56)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    // reduce
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    // H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My])
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    m += 64;
    bytes -= 64;
  }

  st->H[0] = H0;
  st->H[1] = H1;
  st->H[2] = H2;
  st->H[3] = H3;
  st->H[4] = H4;
}

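// Absorbs up to 32 buffered bytes with one more [r^2,r^2] step, then closes
// the two-lane evaluation by multiplying by [r^2,r]: the 32-bit slots of
// st->P[1] that feed the second lane (d[2]) are overwritten with the 26-bit
// limbs of r itself, so the x lane is multiplied by r^2 and the y lane by r.
// The two lanes are summed, carries are resolved, and the result is repacked
// into the base-2^44 limbs st->HH[0..2] for the scalar finish code. Returns
// the number of buffered bytes consumed (0 or 32).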
static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m,
                               size_t bytes) {
  const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi M0, M1, M2, M3, M4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi C1, C2;

  uint64_t r0, r1, r2;
  uint64_t t0, t1, t2, t3, t4;
  uint64_t c;
  size_t consumed = 0;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  // p = [r^2,r^2]
  p = &st->P[1];

  if (bytes >= 32) {
    // H *= [r^2,r^2]
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    // reduce
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    // H = (H*[r^2,r^2] + [Mx,My])
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    consumed = 32;
  }

  // finalize, H *= [r^2,r]
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];

  p->R20.d[2] = (uint32_t)(r0)&0x3ffffff;
  p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
  p->R22.d[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
  p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
  p->R24.d[2] = (uint32_t)((r2 >> 16));
  p->S21.d[2] = p->R21.d[2] * 5;
  p->S22.d[2] = p->R22.d[2] * 5;
  p->S23.d[2] = p->R23.d[2] * 5;
  p->S24.d[2] = p->R24.d[2] * 5;

  // H *= [r^2,r]
  T0 = _mm_mul_epu32(H0, p->R20.v);
  T1 = _mm_mul_epu32(H0, p->R21.v);
  T2 = _mm_mul_epu32(H0, p->R22.v);
  T3 = _mm_mul_epu32(H0, p->R23.v);
  T4 = _mm_mul_epu32(H0, p->R24.v);
  T5 = _mm_mul_epu32(H1, p->S24.v);
  T6 = _mm_mul_epu32(H1, p->R20.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H2, p->S23.v);
  T6 = _mm_mul_epu32(H2, p->S24.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H3, p->S22.v);
  T6 = _mm_mul_epu32(H3, p->S23.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H4, p->S21.v);
  T6 = _mm_mul_epu32(H4, p->S22.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H1, p->R21.v);
  T6 = _mm_mul_epu32(H1, p->R22.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H2, p->R20.v);
  T6 = _mm_mul_epu32(H2, p->R21.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H3, p->S24.v);
  T6 = _mm_mul_epu32(H3, p->R20.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H4, p->S23.v);
  T6 = _mm_mul_epu32(H4, p->S24.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H1, p->R23.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H2, p->R22.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H3, p->R21.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H4, p->R20.v);
  T4 = _mm_add_epi64(T4, T5);

  C1 = _mm_srli_epi64(T0, 26);
  C2 = _mm_srli_epi64(T3, 26);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_and_si128(T3, MMASK);
  T1 = _mm_add_epi64(T1, C1);
  T4 = _mm_add_epi64(T4, C2);
  C1 = _mm_srli_epi64(T1, 26);
  C2 = _mm_srli_epi64(T4, 26);
  T1 = _mm_and_si128(T1, MMASK);
  T4 = _mm_and_si128(T4, MMASK);
  T2 = _mm_add_epi64(T2, C1);
  T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
  C1 = _mm_srli_epi64(T2, 26);
  C2 = _mm_srli_epi64(T0, 26);
  T2 = _mm_and_si128(T2, MMASK);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_add_epi64(T3, C1);
  T1 = _mm_add_epi64(T1, C2);
  C1 = _mm_srli_epi64(T3, 26);
  T3 = _mm_and_si128(T3, MMASK);
  T4 = _mm_add_epi64(T4, C1);

  // H = H[0]+H[1]
  H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
  H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
  H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
  H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
  H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));

  t0 = _mm_cvtsi128_si32(H0);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = _mm_cvtsi128_si32(H1) + c;
  c = (t1 >> 26);
  t1 &= 0x3ffffff;
  t2 = _mm_cvtsi128_si32(H2) + c;
  c = (t2 >> 26);
  t2 &= 0x3ffffff;
  t3 = _mm_cvtsi128_si32(H3) + c;
  c = (t3 >> 26);
  t3 &= 0x3ffffff;
  t4 = _mm_cvtsi128_si32(H4) + c;
  c = (t4 >> 26);
  t4 &= 0x3ffffff;
  t0 = t0 + (c * 5);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = t1 + c;

  st->HH[0] = ((t0) | (t1 << 26)) & UINT64_C(0xfffffffffff);
  st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & UINT64_C(0xfffffffffff);
  st->HH[2] = ((t3 >> 10) | (t4 << 16)) & UINT64_C(0x3ffffffffff);

  return consumed;
}

void CRYPTO_poly1305_update(poly1305_state *state, const uint8_t *m,
                            size_t bytes) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t want;

  // Work around a C language bug. See https://crbug.com/1019588.
  if (bytes == 0) {
    return;
  }

  // need at least 32 initial bytes to start the accelerated branch
  if (!st->started) {
    if ((st->leftover == 0) && (bytes > 32)) {
      poly1305_first_block(st, m);
      m += 32;
      bytes -= 32;
    } else {
      want = poly1305_min(32 - st->leftover, bytes);
      OPENSSL_memcpy(st->buffer + st->leftover, m, want);
      bytes -= want;
      m += want;
      st->leftover += want;
      if ((st->leftover < 32) || (bytes == 0)) {
        return;
      }
      poly1305_first_block(st, st->buffer);
      st->leftover = 0;
    }
    st->started = 1;
  }

  // handle leftover
  if (st->leftover) {
    want = poly1305_min(64 - st->leftover, bytes);
    OPENSSL_memcpy(st->buffer + st->leftover, m, want);
    bytes -= want;
    m += want;
    st->leftover += want;
    if (st->leftover < 64) {
      return;
    }
    poly1305_blocks(st, st->buffer, 64);
    st->leftover = 0;
  }

  // process 64 byte blocks
  if (bytes >= 64) {
    want = (bytes & ~63);
    poly1305_blocks(st, m, want);
    m += want;
    bytes -= want;
  }

  if (bytes) {
    OPENSSL_memcpy(st->buffer + st->leftover, m, bytes);
    st->leftover += bytes;
  }
}

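// Folds any SIMD state into st->HH via poly1305_combine, then runs the
// remaining buffered bytes (or the whole message, if the SIMD path never
// started) through a scalar base-2^44 Horner loop. The tail block is padded
// with a 1 byte and zeros, and the final steps are the usual Poly1305
// reduction: a carry pass, a constant-time conditional subtraction of
// 2^130 - 5, and addition of the pad s before the 16-byte tag is written out
// little-endian.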
void CRYPTO_poly1305_finish(poly1305_state *state, uint8_t mac[16]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t leftover = st->leftover;
  uint8_t *m = st->buffer;
  uint128_t d[3];
  uint64_t h0, h1, h2;
  uint64_t t0, t1;
  uint64_t g0, g1, g2, c, nc;
  uint64_t r0, r1, r2, s1, s2;
  poly1305_power *p;

  if (st->started) {
    size_t consumed = poly1305_combine(st, m, leftover);
    leftover -= consumed;
    m += consumed;
  }

  // st->HH will either be 0 or have the combined result
  h0 = st->HH[0];
  h1 = st->HH[1];
  h2 = st->HH[2];

  p = &st->P[1];
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  s1 = r1 * (5 << 2);
  s2 = r2 * (5 << 2);

  if (leftover < 16) {
    goto poly1305_donna_atmost15bytes;
  }

poly1305_donna_atleast16bytes:
  t0 = load_u64_le(m + 0);
  t1 = load_u64_le(m + 8);
  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24) | ((uint64_t)1 << 40);

poly1305_donna_mul:
  d[0] = add128(add128(mul64x64_128(h0, r0), mul64x64_128(h1, s2)),
                mul64x64_128(h2, s1));
  d[1] = add128(add128(mul64x64_128(h0, r1), mul64x64_128(h1, r0)),
                mul64x64_128(h2, s2));
  d[2] = add128(add128(mul64x64_128(h0, r2), mul64x64_128(h1, r1)),
                mul64x64_128(h2, r0));
  h0 = lo128(d[0]) & 0xfffffffffff;
  c = shr128(d[0], 44);
  d[1] = add128_64(d[1], c);
  h1 = lo128(d[1]) & 0xfffffffffff;
  c = shr128(d[1], 44);
  d[2] = add128_64(d[2], c);
  h2 = lo128(d[2]) & 0x3ffffffffff;
  c = shr128(d[2], 42);
  h0 += c * 5;

  m += 16;
  leftover -= 16;
  if (leftover >= 16) {
    goto poly1305_donna_atleast16bytes;
  }

// final bytes
poly1305_donna_atmost15bytes:
  if (!leftover) {
    goto poly1305_donna_finish;
  }

  m[leftover++] = 1;
  OPENSSL_memset(m + leftover, 0, 16 - leftover);
  leftover = 16;

  t0 = load_u64_le(m + 0);
  t1 = load_u64_le(m + 8);
  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24);

  goto poly1305_donna_mul;

poly1305_donna_finish:
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  h1 += c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  h2 += c;
  c = (h2 >> 42);
  h2 &= 0x3ffffffffff;
  h0 += c * 5;

  g0 = h0 + 5;
  c = (g0 >> 44);
  g0 &= 0xfffffffffff;
  g1 = h1 + c;
  c = (g1 >> 44);
  g1 &= 0xfffffffffff;
  g2 = h2 + c - ((uint64_t)1 << 42);

  c = (g2 >> 63) - 1;
  nc = ~c;
  h0 = (h0 & nc) | (g0 & c);
  h1 = (h1 & nc) | (g1 & c);
  h2 = (h2 & nc) | (g2 & c);

  // pad
  t0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  t1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];
  h0 += (t0 & 0xfffffffffff);
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += (t0 & 0xfffffffffff) + c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  t1 = (t1 >> 24);
  h2 += (t1) + c;

  store_u64_le(mac + 0, ((h0) | (h1 << 44)));
  store_u64_le(mac + 8, ((h1 >> 20) | (h2 << 24)));
}

#endif  // BORINGSSL_HAS_UINT128 && OPENSSL_X86_64