1 /* Copyright (c) 2014, Google Inc.
2  *
3  * Permission to use, copy, modify, and/or distribute this software for any
4  * purpose with or without fee is hereby granted, provided that the above
5  * copyright notice and this permission notice appear in all copies.
6  *
7  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
10  * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
12  * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
13  * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
14 
15 // This implementation of poly1305 is by Andrew Moon
16 // (https://github.com/floodyberry/poly1305-donna) and released as public
17 // domain. It implements SIMD vectorization based on the algorithm described in
18 // http://cr.yp.to/papers.html#neoncrypto. Unrolled to 2 powers, i.e. a 64-byte
19 // block size.
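//
// Layout: each 128-bit register holds one 26-bit limb from each of two
// interleaved accumulators/message blocks, so a 64-byte chunk is absorbed as
// H = H*[r^4,r^4] + [m0,m1]*[r^2,r^2] + [m2,m3]. At the end the two lanes are
// combined with a final multiply by [r^2,r].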
20 
21 #include <openssl/poly1305.h>
22 
23 #include "../internal.h"
24 
25 
26 #if defined(BORINGSSL_HAS_UINT128) && defined(OPENSSL_X86_64)
27 
28 #include <emmintrin.h>
29 
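// Unaligned little-endian loads/stores via memcpy (x86-64 is little-endian,
// so no byte swapping is needed).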
30 static uint32_t load_u32_le(const uint8_t in[4]) {
31   uint32_t ret;
32   OPENSSL_memcpy(&ret, in, 4);
33   return ret;
34 }
35 
36 static uint64_t load_u64_le(const uint8_t in[8]) {
37   uint64_t ret;
38   OPENSSL_memcpy(&ret, in, 8);
39   return ret;
40 }
41 
42 static void store_u64_le(uint8_t out[8], uint64_t v) {
43   OPENSSL_memcpy(out, &v, 8);
44 }
45 
46 typedef __m128i xmmi;
47 
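// poly1305_x64_sse2_message_mask selects the low 26 bits of each 64-bit lane;
// poly1305_x64_sse2_5 is used to fold carries out of the top limb
// (2^130 = 5 mod 2^130 - 5); poly1305_x64_sse2_1shl128 is 2^128 expressed in
// the top 26-bit limb (weight 2^104), i.e. 1 << 24.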
48 static const alignas(16) uint32_t poly1305_x64_sse2_message_mask[4] = {
49     (1 << 26) - 1, 0, (1 << 26) - 1, 0};
50 static const alignas(16) uint32_t poly1305_x64_sse2_5[4] = {5, 0, 5, 0};
51 static const alignas(16) uint32_t poly1305_x64_sse2_1shl128[4] = {
52     (1 << 24), 0, (1 << 24), 0};
53 
54 static inline uint128_t add128(uint128_t a, uint128_t b) { return a + b; }
55 
56 static inline uint128_t add128_64(uint128_t a, uint64_t b) { return a + b; }
57 
58 static inline uint128_t mul64x64_128(uint64_t a, uint64_t b) {
59   return (uint128_t)a * b;
60 }
61 
62 static inline uint64_t lo128(uint128_t a) { return (uint64_t)a; }
63 
64 static inline uint64_t shr128(uint128_t v, const int shift) {
65   return (uint64_t)(v >> shift);
66 }
67 
68 static inline uint64_t shr128_pair(uint64_t hi, uint64_t lo, const int shift) {
69   return (uint64_t)((((uint128_t)hi << 64) | lo) >> shift);
70 }
71 
72 typedef struct poly1305_power_t {
73   union {
74     xmmi v;
75     uint64_t u[2];
76     uint32_t d[4];
77   } R20, R21, R22, R23, R24, S21, S22, S23, S24;
78 } poly1305_power;
79 
80 typedef struct poly1305_state_internal_t {
81   poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144
82                           bytes of free storage */
83   union {
84     xmmi H[5];  //  80 bytes
85     uint64_t HH[10];
86   };
87   // uint64_t r0,r1,r2;       [24 bytes]
88   // uint64_t pad0,pad1;      [16 bytes]
89   uint64_t started;        //   8 bytes
90   uint64_t leftover;       //   8 bytes
91   uint8_t buffer[64];      //  64 bytes
92 } poly1305_state_internal; /* 448 bytes total + 63 bytes for
93                               alignment = 511 bytes raw */
94 
95 OPENSSL_STATIC_ASSERT(
96     sizeof(struct poly1305_state_internal_t) + 63 <= sizeof(poly1305_state),
97     "poly1305_state isn't large enough to hold aligned poly1305_state_internal_t");
98 
99 static inline poly1305_state_internal *poly1305_aligned_state(
100     poly1305_state *state) {
101   return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63);
102 }
103 
104 static inline size_t poly1305_min(size_t a, size_t b) {
105   return (a < b) ? a : b;
106 }
107 
108 void CRYPTO_poly1305_init(poly1305_state *state, const uint8_t key[32]) {
109   poly1305_state_internal *st = poly1305_aligned_state(state);
110   poly1305_power *p;
111   uint64_t r0, r1, r2;
112   uint64_t t0, t1;
113 
114   // clamp key
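  // r is kept as three limbs of 44, 44 and 42 bits; the masks below also
  // apply the standard poly1305 clamp to r.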
115   t0 = load_u64_le(key + 0);
116   t1 = load_u64_le(key + 8);
117   r0 = t0 & 0xffc0fffffff;
118   t0 >>= 44;
119   t0 |= t1 << 20;
120   r1 = t0 & 0xfffffc0ffff;
121   t1 >>= 24;
122   r2 = t1 & 0x00ffffffc0f;
123 
124   // store r in un-used space of st->P[1]
125   p = &st->P[1];
126   p->R20.d[1] = (uint32_t)(r0);
127   p->R20.d[3] = (uint32_t)(r0 >> 32);
128   p->R21.d[1] = (uint32_t)(r1);
129   p->R21.d[3] = (uint32_t)(r1 >> 32);
130   p->R22.d[1] = (uint32_t)(r2);
131   p->R22.d[3] = (uint32_t)(r2 >> 32);
132 
133   // store pad
134   p->R23.d[1] = load_u32_le(key + 16);
135   p->R23.d[3] = load_u32_le(key + 20);
136   p->R24.d[1] = load_u32_le(key + 24);
137   p->R24.d[3] = load_u32_le(key + 28);
138 
139   // H = 0
140   st->H[0] = _mm_setzero_si128();
141   st->H[1] = _mm_setzero_si128();
142   st->H[2] = _mm_setzero_si128();
143   st->H[3] = _mm_setzero_si128();
144   st->H[4] = _mm_setzero_si128();
145 
146   st->started = 0;
147   st->leftover = 0;
148 }
149 
150 static void poly1305_first_block(poly1305_state_internal *st,
151                                  const uint8_t *m) {
152   const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
153   const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
154   const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
155   xmmi T5, T6;
156   poly1305_power *p;
157   uint128_t d[3];
158   uint64_t r0, r1, r2;
159   uint64_t r20, r21, r22, s22;
160   uint64_t pad0, pad1;
161   uint64_t c;
162   uint64_t i;
163 
164   // pull out stored info
165   p = &st->P[1];
166 
167   r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
168   r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
169   r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
170   pad0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
171   pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];
172 
173   // compute powers r^2,r^4
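  // Each pass squares the current power in 44-bit limbs, converts the result
  // to five 26-bit limbs broadcast into both vector lanes, and precomputes
  // the 5*r limbs (S2x) used during reduction. r^2 is written to P[1], then
  // r^4 to P[0].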
174   r20 = r0;
175   r21 = r1;
176   r22 = r2;
177   for (i = 0; i < 2; i++) {
178     s22 = r22 * (5 << 2);
179 
180     d[0] = add128(mul64x64_128(r20, r20), mul64x64_128(r21 * 2, s22));
181     d[1] = add128(mul64x64_128(r22, s22), mul64x64_128(r20 * 2, r21));
182     d[2] = add128(mul64x64_128(r21, r21), mul64x64_128(r22 * 2, r20));
183 
184     r20 = lo128(d[0]) & 0xfffffffffff;
185     c = shr128(d[0], 44);
186     d[1] = add128_64(d[1], c);
187     r21 = lo128(d[1]) & 0xfffffffffff;
188     c = shr128(d[1], 44);
189     d[2] = add128_64(d[2], c);
190     r22 = lo128(d[2]) & 0x3ffffffffff;
191     c = shr128(d[2], 42);
192     r20 += c * 5;
193     c = (r20 >> 44);
194     r20 = r20 & 0xfffffffffff;
195     r21 += c;
196 
197     p->R20.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)(r20)&0x3ffffff),
198                                  _MM_SHUFFLE(1, 0, 1, 0));
199     p->R21.v = _mm_shuffle_epi32(
200         _mm_cvtsi32_si128((uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff),
201         _MM_SHUFFLE(1, 0, 1, 0));
202     p->R22.v =
203         _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 8)) & 0x3ffffff),
204                           _MM_SHUFFLE(1, 0, 1, 0));
205     p->R23.v = _mm_shuffle_epi32(
206         _mm_cvtsi32_si128((uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff),
207         _MM_SHUFFLE(1, 0, 1, 0));
208     p->R24.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r22 >> 16))),
209                                  _MM_SHUFFLE(1, 0, 1, 0));
210     p->S21.v = _mm_mul_epu32(p->R21.v, FIVE);
211     p->S22.v = _mm_mul_epu32(p->R22.v, FIVE);
212     p->S23.v = _mm_mul_epu32(p->R23.v, FIVE);
213     p->S24.v = _mm_mul_epu32(p->R24.v, FIVE);
214     p--;
215   }
216 
217   // put saved info back
218   p = &st->P[1];
219   p->R20.d[1] = (uint32_t)(r0);
220   p->R20.d[3] = (uint32_t)(r0 >> 32);
221   p->R21.d[1] = (uint32_t)(r1);
222   p->R21.d[3] = (uint32_t)(r1 >> 32);
223   p->R22.d[1] = (uint32_t)(r2);
224   p->R22.d[3] = (uint32_t)(r2 >> 32);
225   p->R23.d[1] = (uint32_t)(pad0);
226   p->R23.d[3] = (uint32_t)(pad0 >> 32);
227   p->R24.d[1] = (uint32_t)(pad1);
228   p->R24.d[3] = (uint32_t)(pad1 >> 32);
229 
230   // H = [Mx,My]
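  // Load two 16-byte blocks (one per 64-bit lane), split each into five
  // 26-bit limbs and set the 2^128 padding bit via HIBIT.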
231   T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
232                           _mm_loadl_epi64((const xmmi *)(m + 16)));
233   T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
234                           _mm_loadl_epi64((const xmmi *)(m + 24)));
235   st->H[0] = _mm_and_si128(MMASK, T5);
236   st->H[1] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
237   T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
238   st->H[2] = _mm_and_si128(MMASK, T5);
239   st->H[3] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
240   st->H[4] = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
241 }
242 
243 static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m,
244                             size_t bytes) {
245   const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
246   const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
247   const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
248 
249   poly1305_power *p;
250   xmmi H0, H1, H2, H3, H4;
251   xmmi T0, T1, T2, T3, T4, T5, T6;
252   xmmi M0, M1, M2, M3, M4;
253   xmmi C1, C2;
254 
255   H0 = st->H[0];
256   H1 = st->H[1];
257   H2 = st->H[2];
258   H3 = st->H[3];
259   H4 = st->H[4];
260 
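  // Per 64-byte iteration: H = H*[r^4,r^4] + [m0,m1]*[r^2,r^2] + [m2,m3], so
  // the two lanes together absorb four 16-byte blocks per pass.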
261   while (bytes >= 64) {
262     // H *= [r^4,r^4]
263     p = &st->P[0];
264     T0 = _mm_mul_epu32(H0, p->R20.v);
265     T1 = _mm_mul_epu32(H0, p->R21.v);
266     T2 = _mm_mul_epu32(H0, p->R22.v);
267     T3 = _mm_mul_epu32(H0, p->R23.v);
268     T4 = _mm_mul_epu32(H0, p->R24.v);
269     T5 = _mm_mul_epu32(H1, p->S24.v);
270     T6 = _mm_mul_epu32(H1, p->R20.v);
271     T0 = _mm_add_epi64(T0, T5);
272     T1 = _mm_add_epi64(T1, T6);
273     T5 = _mm_mul_epu32(H2, p->S23.v);
274     T6 = _mm_mul_epu32(H2, p->S24.v);
275     T0 = _mm_add_epi64(T0, T5);
276     T1 = _mm_add_epi64(T1, T6);
277     T5 = _mm_mul_epu32(H3, p->S22.v);
278     T6 = _mm_mul_epu32(H3, p->S23.v);
279     T0 = _mm_add_epi64(T0, T5);
280     T1 = _mm_add_epi64(T1, T6);
281     T5 = _mm_mul_epu32(H4, p->S21.v);
282     T6 = _mm_mul_epu32(H4, p->S22.v);
283     T0 = _mm_add_epi64(T0, T5);
284     T1 = _mm_add_epi64(T1, T6);
285     T5 = _mm_mul_epu32(H1, p->R21.v);
286     T6 = _mm_mul_epu32(H1, p->R22.v);
287     T2 = _mm_add_epi64(T2, T5);
288     T3 = _mm_add_epi64(T3, T6);
289     T5 = _mm_mul_epu32(H2, p->R20.v);
290     T6 = _mm_mul_epu32(H2, p->R21.v);
291     T2 = _mm_add_epi64(T2, T5);
292     T3 = _mm_add_epi64(T3, T6);
293     T5 = _mm_mul_epu32(H3, p->S24.v);
294     T6 = _mm_mul_epu32(H3, p->R20.v);
295     T2 = _mm_add_epi64(T2, T5);
296     T3 = _mm_add_epi64(T3, T6);
297     T5 = _mm_mul_epu32(H4, p->S23.v);
298     T6 = _mm_mul_epu32(H4, p->S24.v);
299     T2 = _mm_add_epi64(T2, T5);
300     T3 = _mm_add_epi64(T3, T6);
301     T5 = _mm_mul_epu32(H1, p->R23.v);
302     T4 = _mm_add_epi64(T4, T5);
303     T5 = _mm_mul_epu32(H2, p->R22.v);
304     T4 = _mm_add_epi64(T4, T5);
305     T5 = _mm_mul_epu32(H3, p->R21.v);
306     T4 = _mm_add_epi64(T4, T5);
307     T5 = _mm_mul_epu32(H4, p->R20.v);
308     T4 = _mm_add_epi64(T4, T5);
309 
310     // H += [Mx,My]*[r^2,r^2]
311     T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
312                             _mm_loadl_epi64((const xmmi *)(m + 16)));
313     T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
314                             _mm_loadl_epi64((const xmmi *)(m + 24)));
315     M0 = _mm_and_si128(MMASK, T5);
316     M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
317     T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
318     M2 = _mm_and_si128(MMASK, T5);
319     M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
320     M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
321 
322     p = &st->P[1];
323     T5 = _mm_mul_epu32(M0, p->R20.v);
324     T6 = _mm_mul_epu32(M0, p->R21.v);
325     T0 = _mm_add_epi64(T0, T5);
326     T1 = _mm_add_epi64(T1, T6);
327     T5 = _mm_mul_epu32(M1, p->S24.v);
328     T6 = _mm_mul_epu32(M1, p->R20.v);
329     T0 = _mm_add_epi64(T0, T5);
330     T1 = _mm_add_epi64(T1, T6);
331     T5 = _mm_mul_epu32(M2, p->S23.v);
332     T6 = _mm_mul_epu32(M2, p->S24.v);
333     T0 = _mm_add_epi64(T0, T5);
334     T1 = _mm_add_epi64(T1, T6);
335     T5 = _mm_mul_epu32(M3, p->S22.v);
336     T6 = _mm_mul_epu32(M3, p->S23.v);
337     T0 = _mm_add_epi64(T0, T5);
338     T1 = _mm_add_epi64(T1, T6);
339     T5 = _mm_mul_epu32(M4, p->S21.v);
340     T6 = _mm_mul_epu32(M4, p->S22.v);
341     T0 = _mm_add_epi64(T0, T5);
342     T1 = _mm_add_epi64(T1, T6);
343     T5 = _mm_mul_epu32(M0, p->R22.v);
344     T6 = _mm_mul_epu32(M0, p->R23.v);
345     T2 = _mm_add_epi64(T2, T5);
346     T3 = _mm_add_epi64(T3, T6);
347     T5 = _mm_mul_epu32(M1, p->R21.v);
348     T6 = _mm_mul_epu32(M1, p->R22.v);
349     T2 = _mm_add_epi64(T2, T5);
350     T3 = _mm_add_epi64(T3, T6);
351     T5 = _mm_mul_epu32(M2, p->R20.v);
352     T6 = _mm_mul_epu32(M2, p->R21.v);
353     T2 = _mm_add_epi64(T2, T5);
354     T3 = _mm_add_epi64(T3, T6);
355     T5 = _mm_mul_epu32(M3, p->S24.v);
356     T6 = _mm_mul_epu32(M3, p->R20.v);
357     T2 = _mm_add_epi64(T2, T5);
358     T3 = _mm_add_epi64(T3, T6);
359     T5 = _mm_mul_epu32(M4, p->S23.v);
360     T6 = _mm_mul_epu32(M4, p->S24.v);
361     T2 = _mm_add_epi64(T2, T5);
362     T3 = _mm_add_epi64(T3, T6);
363     T5 = _mm_mul_epu32(M0, p->R24.v);
364     T4 = _mm_add_epi64(T4, T5);
365     T5 = _mm_mul_epu32(M1, p->R23.v);
366     T4 = _mm_add_epi64(T4, T5);
367     T5 = _mm_mul_epu32(M2, p->R22.v);
368     T4 = _mm_add_epi64(T4, T5);
369     T5 = _mm_mul_epu32(M3, p->R21.v);
370     T4 = _mm_add_epi64(T4, T5);
371     T5 = _mm_mul_epu32(M4, p->R20.v);
372     T4 = _mm_add_epi64(T4, T5);
373 
374     // H += [Mx,My]
375     T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 32)),
376                             _mm_loadl_epi64((const xmmi *)(m + 48)));
377     T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 40)),
378                             _mm_loadl_epi64((const xmmi *)(m + 56)));
379     M0 = _mm_and_si128(MMASK, T5);
380     M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
381     T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
382     M2 = _mm_and_si128(MMASK, T5);
383     M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
384     M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
385 
386     T0 = _mm_add_epi64(T0, M0);
387     T1 = _mm_add_epi64(T1, M1);
388     T2 = _mm_add_epi64(T2, M2);
389     T3 = _mm_add_epi64(T3, M3);
390     T4 = _mm_add_epi64(T4, M4);
391 
392     // reduce
393     C1 = _mm_srli_epi64(T0, 26);
394     C2 = _mm_srli_epi64(T3, 26);
395     T0 = _mm_and_si128(T0, MMASK);
396     T3 = _mm_and_si128(T3, MMASK);
397     T1 = _mm_add_epi64(T1, C1);
398     T4 = _mm_add_epi64(T4, C2);
399     C1 = _mm_srli_epi64(T1, 26);
400     C2 = _mm_srli_epi64(T4, 26);
401     T1 = _mm_and_si128(T1, MMASK);
402     T4 = _mm_and_si128(T4, MMASK);
403     T2 = _mm_add_epi64(T2, C1);
404     T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
405     C1 = _mm_srli_epi64(T2, 26);
406     C2 = _mm_srli_epi64(T0, 26);
407     T2 = _mm_and_si128(T2, MMASK);
408     T0 = _mm_and_si128(T0, MMASK);
409     T3 = _mm_add_epi64(T3, C1);
410     T1 = _mm_add_epi64(T1, C2);
411     C1 = _mm_srli_epi64(T3, 26);
412     T3 = _mm_and_si128(T3, MMASK);
413     T4 = _mm_add_epi64(T4, C1);
414 
415     // H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My])
416     H0 = T0;
417     H1 = T1;
418     H2 = T2;
419     H3 = T3;
420     H4 = T4;
421 
422     m += 64;
423     bytes -= 64;
424   }
425 
426   st->H[0] = H0;
427   st->H[1] = H1;
428   st->H[2] = H2;
429   st->H[3] = H3;
430   st->H[4] = H4;
431 }
432 
433 static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m,
434                                size_t bytes) {
435   const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
436   const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
437   const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
438 
439   poly1305_power *p;
440   xmmi H0, H1, H2, H3, H4;
441   xmmi M0, M1, M2, M3, M4;
442   xmmi T0, T1, T2, T3, T4, T5, T6;
443   xmmi C1, C2;
444 
445   uint64_t r0, r1, r2;
446   uint64_t t0, t1, t2, t3, t4;
447   uint64_t c;
448   size_t consumed = 0;
449 
450   H0 = st->H[0];
451   H1 = st->H[1];
452   H2 = st->H[2];
453   H3 = st->H[3];
454   H4 = st->H[4];
455 
456   // p = [r^2,r^2]
457   p = &st->P[1];
458 
459   if (bytes >= 32) {
460     // H *= [r^2,r^2]
461     T0 = _mm_mul_epu32(H0, p->R20.v);
462     T1 = _mm_mul_epu32(H0, p->R21.v);
463     T2 = _mm_mul_epu32(H0, p->R22.v);
464     T3 = _mm_mul_epu32(H0, p->R23.v);
465     T4 = _mm_mul_epu32(H0, p->R24.v);
466     T5 = _mm_mul_epu32(H1, p->S24.v);
467     T6 = _mm_mul_epu32(H1, p->R20.v);
468     T0 = _mm_add_epi64(T0, T5);
469     T1 = _mm_add_epi64(T1, T6);
470     T5 = _mm_mul_epu32(H2, p->S23.v);
471     T6 = _mm_mul_epu32(H2, p->S24.v);
472     T0 = _mm_add_epi64(T0, T5);
473     T1 = _mm_add_epi64(T1, T6);
474     T5 = _mm_mul_epu32(H3, p->S22.v);
475     T6 = _mm_mul_epu32(H3, p->S23.v);
476     T0 = _mm_add_epi64(T0, T5);
477     T1 = _mm_add_epi64(T1, T6);
478     T5 = _mm_mul_epu32(H4, p->S21.v);
479     T6 = _mm_mul_epu32(H4, p->S22.v);
480     T0 = _mm_add_epi64(T0, T5);
481     T1 = _mm_add_epi64(T1, T6);
482     T5 = _mm_mul_epu32(H1, p->R21.v);
483     T6 = _mm_mul_epu32(H1, p->R22.v);
484     T2 = _mm_add_epi64(T2, T5);
485     T3 = _mm_add_epi64(T3, T6);
486     T5 = _mm_mul_epu32(H2, p->R20.v);
487     T6 = _mm_mul_epu32(H2, p->R21.v);
488     T2 = _mm_add_epi64(T2, T5);
489     T3 = _mm_add_epi64(T3, T6);
490     T5 = _mm_mul_epu32(H3, p->S24.v);
491     T6 = _mm_mul_epu32(H3, p->R20.v);
492     T2 = _mm_add_epi64(T2, T5);
493     T3 = _mm_add_epi64(T3, T6);
494     T5 = _mm_mul_epu32(H4, p->S23.v);
495     T6 = _mm_mul_epu32(H4, p->S24.v);
496     T2 = _mm_add_epi64(T2, T5);
497     T3 = _mm_add_epi64(T3, T6);
498     T5 = _mm_mul_epu32(H1, p->R23.v);
499     T4 = _mm_add_epi64(T4, T5);
500     T5 = _mm_mul_epu32(H2, p->R22.v);
501     T4 = _mm_add_epi64(T4, T5);
502     T5 = _mm_mul_epu32(H3, p->R21.v);
503     T4 = _mm_add_epi64(T4, T5);
504     T5 = _mm_mul_epu32(H4, p->R20.v);
505     T4 = _mm_add_epi64(T4, T5);
506 
507     // H += [Mx,My]
508     T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
509                             _mm_loadl_epi64((const xmmi *)(m + 16)));
510     T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
511                             _mm_loadl_epi64((const xmmi *)(m + 24)));
512     M0 = _mm_and_si128(MMASK, T5);
513     M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
514     T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
515     M2 = _mm_and_si128(MMASK, T5);
516     M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
517     M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
518 
519     T0 = _mm_add_epi64(T0, M0);
520     T1 = _mm_add_epi64(T1, M1);
521     T2 = _mm_add_epi64(T2, M2);
522     T3 = _mm_add_epi64(T3, M3);
523     T4 = _mm_add_epi64(T4, M4);
524 
525     // reduce
526     C1 = _mm_srli_epi64(T0, 26);
527     C2 = _mm_srli_epi64(T3, 26);
528     T0 = _mm_and_si128(T0, MMASK);
529     T3 = _mm_and_si128(T3, MMASK);
530     T1 = _mm_add_epi64(T1, C1);
531     T4 = _mm_add_epi64(T4, C2);
532     C1 = _mm_srli_epi64(T1, 26);
533     C2 = _mm_srli_epi64(T4, 26);
534     T1 = _mm_and_si128(T1, MMASK);
535     T4 = _mm_and_si128(T4, MMASK);
536     T2 = _mm_add_epi64(T2, C1);
537     T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
538     C1 = _mm_srli_epi64(T2, 26);
539     C2 = _mm_srli_epi64(T0, 26);
540     T2 = _mm_and_si128(T2, MMASK);
541     T0 = _mm_and_si128(T0, MMASK);
542     T3 = _mm_add_epi64(T3, C1);
543     T1 = _mm_add_epi64(T1, C2);
544     C1 = _mm_srli_epi64(T3, 26);
545     T3 = _mm_and_si128(T3, MMASK);
546     T4 = _mm_add_epi64(T4, C1);
547 
548     // H = (H*[r^2,r^2] + [Mx,My])
549     H0 = T0;
550     H1 = T1;
551     H2 = T2;
552     H3 = T3;
553     H4 = T4;
554 
555     consumed = 32;
556   }
557 
558   // finalize, H *= [r^2,r]
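  // Overwrite the second lane (the d[2] words) of the stored r^2 power with
  // the limbs of r itself, so the multiply below weights lane x by r^2 and
  // lane y by r before the two lanes are summed.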
559   r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
560   r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
561   r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
562 
563   p->R20.d[2] = (uint32_t)(r0)&0x3ffffff;
564   p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
565   p->R22.d[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
566   p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
567   p->R24.d[2] = (uint32_t)((r2 >> 16));
568   p->S21.d[2] = p->R21.d[2] * 5;
569   p->S22.d[2] = p->R22.d[2] * 5;
570   p->S23.d[2] = p->R23.d[2] * 5;
571   p->S24.d[2] = p->R24.d[2] * 5;
572 
573   // H *= [r^2,r]
574   T0 = _mm_mul_epu32(H0, p->R20.v);
575   T1 = _mm_mul_epu32(H0, p->R21.v);
576   T2 = _mm_mul_epu32(H0, p->R22.v);
577   T3 = _mm_mul_epu32(H0, p->R23.v);
578   T4 = _mm_mul_epu32(H0, p->R24.v);
579   T5 = _mm_mul_epu32(H1, p->S24.v);
580   T6 = _mm_mul_epu32(H1, p->R20.v);
581   T0 = _mm_add_epi64(T0, T5);
582   T1 = _mm_add_epi64(T1, T6);
583   T5 = _mm_mul_epu32(H2, p->S23.v);
584   T6 = _mm_mul_epu32(H2, p->S24.v);
585   T0 = _mm_add_epi64(T0, T5);
586   T1 = _mm_add_epi64(T1, T6);
587   T5 = _mm_mul_epu32(H3, p->S22.v);
588   T6 = _mm_mul_epu32(H3, p->S23.v);
589   T0 = _mm_add_epi64(T0, T5);
590   T1 = _mm_add_epi64(T1, T6);
591   T5 = _mm_mul_epu32(H4, p->S21.v);
592   T6 = _mm_mul_epu32(H4, p->S22.v);
593   T0 = _mm_add_epi64(T0, T5);
594   T1 = _mm_add_epi64(T1, T6);
595   T5 = _mm_mul_epu32(H1, p->R21.v);
596   T6 = _mm_mul_epu32(H1, p->R22.v);
597   T2 = _mm_add_epi64(T2, T5);
598   T3 = _mm_add_epi64(T3, T6);
599   T5 = _mm_mul_epu32(H2, p->R20.v);
600   T6 = _mm_mul_epu32(H2, p->R21.v);
601   T2 = _mm_add_epi64(T2, T5);
602   T3 = _mm_add_epi64(T3, T6);
603   T5 = _mm_mul_epu32(H3, p->S24.v);
604   T6 = _mm_mul_epu32(H3, p->R20.v);
605   T2 = _mm_add_epi64(T2, T5);
606   T3 = _mm_add_epi64(T3, T6);
607   T5 = _mm_mul_epu32(H4, p->S23.v);
608   T6 = _mm_mul_epu32(H4, p->S24.v);
609   T2 = _mm_add_epi64(T2, T5);
610   T3 = _mm_add_epi64(T3, T6);
611   T5 = _mm_mul_epu32(H1, p->R23.v);
612   T4 = _mm_add_epi64(T4, T5);
613   T5 = _mm_mul_epu32(H2, p->R22.v);
614   T4 = _mm_add_epi64(T4, T5);
615   T5 = _mm_mul_epu32(H3, p->R21.v);
616   T4 = _mm_add_epi64(T4, T5);
617   T5 = _mm_mul_epu32(H4, p->R20.v);
618   T4 = _mm_add_epi64(T4, T5);
619 
620   C1 = _mm_srli_epi64(T0, 26);
621   C2 = _mm_srli_epi64(T3, 26);
622   T0 = _mm_and_si128(T0, MMASK);
623   T3 = _mm_and_si128(T3, MMASK);
624   T1 = _mm_add_epi64(T1, C1);
625   T4 = _mm_add_epi64(T4, C2);
626   C1 = _mm_srli_epi64(T1, 26);
627   C2 = _mm_srli_epi64(T4, 26);
628   T1 = _mm_and_si128(T1, MMASK);
629   T4 = _mm_and_si128(T4, MMASK);
630   T2 = _mm_add_epi64(T2, C1);
631   T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
632   C1 = _mm_srli_epi64(T2, 26);
633   C2 = _mm_srli_epi64(T0, 26);
634   T2 = _mm_and_si128(T2, MMASK);
635   T0 = _mm_and_si128(T0, MMASK);
636   T3 = _mm_add_epi64(T3, C1);
637   T1 = _mm_add_epi64(T1, C2);
638   C1 = _mm_srli_epi64(T3, 26);
639   T3 = _mm_and_si128(T3, MMASK);
640   T4 = _mm_add_epi64(T4, C1);
641 
642   // H = H[0]+H[1]
643   H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
644   H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
645   H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
646   H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
647   H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));
648 
649   t0 = _mm_cvtsi128_si32(H0);
650   c = (t0 >> 26);
651   t0 &= 0x3ffffff;
652   t1 = _mm_cvtsi128_si32(H1) + c;
653   c = (t1 >> 26);
654   t1 &= 0x3ffffff;
655   t2 = _mm_cvtsi128_si32(H2) + c;
656   c = (t2 >> 26);
657   t2 &= 0x3ffffff;
658   t3 = _mm_cvtsi128_si32(H3) + c;
659   c = (t3 >> 26);
660   t3 &= 0x3ffffff;
661   t4 = _mm_cvtsi128_si32(H4) + c;
662   c = (t4 >> 26);
663   t4 &= 0x3ffffff;
664   t0 = t0 + (c * 5);
665   c = (t0 >> 26);
666   t0 &= 0x3ffffff;
667   t1 = t1 + c;
668 
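  // Repack the five 26-bit limbs t0..t4 into three 44/44/42-bit limbs for the
  // scalar finish in CRYPTO_poly1305_finish.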
669   st->HH[0] = ((t0) | (t1 << 26)) & UINT64_C(0xfffffffffff);
670   st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & UINT64_C(0xfffffffffff);
671   st->HH[2] = ((t3 >> 10) | (t4 << 16)) & UINT64_C(0x3ffffffffff);
672 
673   return consumed;
674 }
675 
676 void CRYPTO_poly1305_update(poly1305_state *state, const uint8_t *m,
677                             size_t bytes) {
678   poly1305_state_internal *st = poly1305_aligned_state(state);
679   size_t want;
680 
681   // Work around a C language bug. See https://crbug.com/1019588.
682   if (bytes == 0) {
683     return;
684   }
685 
686   // need at least 32 initial bytes to start the accelerated branch
687   if (!st->started) {
688     if ((st->leftover == 0) && (bytes > 32)) {
689       poly1305_first_block(st, m);
690       m += 32;
691       bytes -= 32;
692     } else {
693       want = poly1305_min(32 - st->leftover, bytes);
694       OPENSSL_memcpy(st->buffer + st->leftover, m, want);
695       bytes -= want;
696       m += want;
697       st->leftover += want;
698       if ((st->leftover < 32) || (bytes == 0)) {
699         return;
700       }
701       poly1305_first_block(st, st->buffer);
702       st->leftover = 0;
703     }
704     st->started = 1;
705   }
706 
707   // handle leftover
708   if (st->leftover) {
709     want = poly1305_min(64 - st->leftover, bytes);
710     OPENSSL_memcpy(st->buffer + st->leftover, m, want);
711     bytes -= want;
712     m += want;
713     st->leftover += want;
714     if (st->leftover < 64) {
715       return;
716     }
717     poly1305_blocks(st, st->buffer, 64);
718     st->leftover = 0;
719   }
720 
721   // process 64 byte blocks
722   if (bytes >= 64) {
723     want = (bytes & ~63);
724     poly1305_blocks(st, m, want);
725     m += want;
726     bytes -= want;
727   }
728 
729   if (bytes) {
730     OPENSSL_memcpy(st->buffer + st->leftover, m, bytes);
731     st->leftover += bytes;
732   }
733 }
734 
735 void CRYPTO_poly1305_finish(poly1305_state *state, uint8_t mac[16]) {
736   poly1305_state_internal *st = poly1305_aligned_state(state);
737   size_t leftover = st->leftover;
738   uint8_t *m = st->buffer;
739   uint128_t d[3];
740   uint64_t h0, h1, h2;
741   uint64_t t0, t1;
742   uint64_t g0, g1, g2, c, nc;
743   uint64_t r0, r1, r2, s1, s2;
744   poly1305_power *p;
745 
746   if (st->started) {
747     size_t consumed = poly1305_combine(st, m, leftover);
748     leftover -= consumed;
749     m += consumed;
750   }
751 
752   // st->HH will either be 0 or have the combined result
753   h0 = st->HH[0];
754   h1 = st->HH[1];
755   h2 = st->HH[2];
756 
757   p = &st->P[1];
758   r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
759   r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
760   r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
761   s1 = r1 * (5 << 2);
762   s2 = r2 * (5 << 2);
763 
764   if (leftover < 16) {
765     goto poly1305_donna_atmost15bytes;
766   }
767 
768 poly1305_donna_atleast16bytes:
769   t0 = load_u64_le(m + 0);
770   t1 = load_u64_le(m + 8);
771   h0 += t0 & 0xfffffffffff;
772   t0 = shr128_pair(t1, t0, 44);
773   h1 += t0 & 0xfffffffffff;
774   h2 += (t1 >> 24) | ((uint64_t)1 << 40);
775 
776 poly1305_donna_mul:
777   d[0] = add128(add128(mul64x64_128(h0, r0), mul64x64_128(h1, s2)),
778                 mul64x64_128(h2, s1));
779   d[1] = add128(add128(mul64x64_128(h0, r1), mul64x64_128(h1, r0)),
780                 mul64x64_128(h2, s2));
781   d[2] = add128(add128(mul64x64_128(h0, r2), mul64x64_128(h1, r1)),
782                 mul64x64_128(h2, r0));
783   h0 = lo128(d[0]) & 0xfffffffffff;
784   c = shr128(d[0], 44);
785   d[1] = add128_64(d[1], c);
786   h1 = lo128(d[1]) & 0xfffffffffff;
787   c = shr128(d[1], 44);
788   d[2] = add128_64(d[2], c);
789   h2 = lo128(d[2]) & 0x3ffffffffff;
790   c = shr128(d[2], 42);
791   h0 += c * 5;
792 
793   m += 16;
794   leftover -= 16;
795   if (leftover >= 16) {
796     goto poly1305_donna_atleast16bytes;
797   }
798 
799 // final bytes
800 poly1305_donna_atmost15bytes:
801   if (!leftover) {
802     goto poly1305_donna_finish;
803   }
804 
805   m[leftover++] = 1;
806   OPENSSL_memset(m + leftover, 0, 16 - leftover);
807   leftover = 16;
808 
809   t0 = load_u64_le(m + 0);
810   t1 = load_u64_le(m + 8);
811   h0 += t0 & 0xfffffffffff;
812   t0 = shr128_pair(t1, t0, 44);
813   h1 += t0 & 0xfffffffffff;
814   h2 += (t1 >> 24);
815 
816   goto poly1305_donna_mul;
817 
818 poly1305_donna_finish:
819   c = (h0 >> 44);
820   h0 &= 0xfffffffffff;
821   h1 += c;
822   c = (h1 >> 44);
823   h1 &= 0xfffffffffff;
824   h2 += c;
825   c = (h2 >> 42);
826   h2 &= 0x3ffffffffff;
827   h0 += c * 5;
828 
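  // Compute g = h + 5 - 2^130. If the subtraction does not borrow then
  // h >= 2^130 - 5 and g is the fully reduced value; the masks select g or h
  // in constant time.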
829   g0 = h0 + 5;
830   c = (g0 >> 44);
831   g0 &= 0xfffffffffff;
832   g1 = h1 + c;
833   c = (g1 >> 44);
834   g1 &= 0xfffffffffff;
835   g2 = h2 + c - ((uint64_t)1 << 42);
836 
837   c = (g2 >> 63) - 1;
838   nc = ~c;
839   h0 = (h0 & nc) | (g0 & c);
840   h1 = (h1 & nc) | (g1 & c);
841   h2 = (h2 & nc) | (g2 & c);
842 
843   // pad
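  // Add the 128-bit pad s from the second half of the key; the tag is the low
  // 128 bits of h + s.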
844   t0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
845   t1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];
846   h0 += (t0 & 0xfffffffffff);
847   c = (h0 >> 44);
848   h0 &= 0xfffffffffff;
849   t0 = shr128_pair(t1, t0, 44);
850   h1 += (t0 & 0xfffffffffff) + c;
851   c = (h1 >> 44);
852   h1 &= 0xfffffffffff;
853   t1 = (t1 >> 24);
854   h2 += t1 + c;
855 
856   store_u64_le(mac + 0, ((h0) | (h1 << 44)));
857   store_u64_le(mac + 8, ((h1 >> 20) | (h2 << 24)));
858 }
859 
860 #endif  // BORINGSSL_HAS_UINT128 && OPENSSL_X86_64
861