/* Copyright (c) 2014, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

// This implementation of poly1305 is by Andrew Moon
// (https://github.com/floodyberry/poly1305-donna) and released as public
// domain. It implements SIMD vectorization based on the algorithm described in
// http://cr.yp.to/papers.html#neoncrypto. Unrolled to 2 powers, i.e. 64 byte
// block size.
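//
// Overview: each 64-bit lane of the SSE2 registers carries one accumulator,
// split into five 26-bit limbs (spread across H[0..4]), so 64 bytes (four
// 16-byte blocks) are absorbed per loop iteration using the precomputed
// powers r^2 and r^4. The powers of r and the final reduction are computed
// with a scalar 44/44/42-bit limb representation on top of a 128-bit
// integer type.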

#include <openssl/poly1305.h>

#include "../internal.h"


#if !defined(OPENSSL_WINDOWS) && defined(OPENSSL_X86_64)

#include <emmintrin.h>

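// The helpers below perform unaligned little-endian loads and stores. That is
// acceptable here because this file only builds for x86-64 (see the guard
// above), which is little-endian and permits unaligned access.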
#define U8TO64_LE(m) (*(const uint64_t *)(m))
#define U8TO32_LE(m) (*(const uint32_t *)(m))
#define U64TO8_LE(m, v) (*(uint64_t *)(m)) = v

typedef __m128i xmmi;

static const alignas(16) uint32_t poly1305_x64_sse2_message_mask[4] = {
    (1 << 26) - 1, 0, (1 << 26) - 1, 0};
static const alignas(16) uint32_t poly1305_x64_sse2_5[4] = {5, 0, 5, 0};
static const alignas(16) uint32_t poly1305_x64_sse2_1shl128[4] = {
    (1 << 24), 0, (1 << 24), 0};

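// Thin wrappers around the compiler-provided 128-bit integer type (uint128_t,
// pulled in via ../internal.h) so that 64x64->128-bit multiplies and wide
// shifts can be written in plain C.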
static inline uint128_t add128(uint128_t a, uint128_t b) { return a + b; }

static inline uint128_t add128_64(uint128_t a, uint64_t b) { return a + b; }

static inline uint128_t mul64x64_128(uint64_t a, uint64_t b) {
  return (uint128_t)a * b;
}

static inline uint64_t lo128(uint128_t a) { return (uint64_t)a; }

static inline uint64_t shr128(uint128_t v, const int shift) {
  return (uint64_t)(v >> shift);
}

static inline uint64_t shr128_pair(uint64_t hi, uint64_t lo, const int shift) {
  return (uint64_t)((((uint128_t)hi << 64) | lo) >> shift);
}

typedef struct poly1305_power_t {
  union {
    xmmi v;
    uint64_t u[2];
    uint32_t d[4];
  } R20, R21, R22, R23, R24, S21, S22, S23, S24;
} poly1305_power;

typedef struct poly1305_state_internal_t {
  poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144
                          bytes of free storage */
  union {
    xmmi H[5];  // 80 bytes
    uint64_t HH[10];
  };
  // uint64_t r0,r1,r2;       [24 bytes]
  // uint64_t pad0,pad1;      [16 bytes]
  uint64_t started;    // 8 bytes
  uint64_t leftover;   // 8 bytes
  uint8_t buffer[64];  // 64 bytes
} poly1305_state_internal; /* 448 bytes total + 63 bytes for
                              alignment = 511 bytes raw */

static inline poly1305_state_internal *poly1305_aligned_state(
    poly1305_state *state) {
  return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63);
}

static inline size_t poly1305_min(size_t a, size_t b) {
  return (a < b) ? a : b;
}

void CRYPTO_poly1305_init(poly1305_state *state, const uint8_t key[32]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  poly1305_power *p;
  uint64_t r0, r1, r2;
  uint64_t t0, t1;

  // clamp key
  t0 = U8TO64_LE(key + 0);
  t1 = U8TO64_LE(key + 8);
  r0 = t0 & 0xffc0fffffff;
  t0 >>= 44;
  t0 |= t1 << 20;
  r1 = t0 & 0xfffffc0ffff;
  t1 >>= 24;
  r2 = t1 & 0x00ffffffc0f;
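  // r is now held in three limbs of 44, 44 and 42 bits (r0, r1, r2). The
  // masks above also apply the Poly1305 clamp, clearing the key bits the
  // algorithm requires to be zero.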

  // store r in unused space of st->P[1]
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);

  // store pad
  p->R23.d[1] = U8TO32_LE(key + 16);
  p->R23.d[3] = U8TO32_LE(key + 20);
  p->R24.d[1] = U8TO32_LE(key + 24);
  p->R24.d[3] = U8TO32_LE(key + 28);

  // H = 0
  st->H[0] = _mm_setzero_si128();
  st->H[1] = _mm_setzero_si128();
  st->H[2] = _mm_setzero_si128();
  st->H[3] = _mm_setzero_si128();
  st->H[4] = _mm_setzero_si128();

  st->started = 0;
  st->leftover = 0;
}

static void poly1305_first_block(poly1305_state_internal *st,
                                 const uint8_t *m) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
  xmmi T5, T6;
  poly1305_power *p;
  uint128_t d[3];
  uint64_t r0, r1, r2;
  uint64_t r20, r21, r22, s22;
  uint64_t pad0, pad1;
  uint64_t c;
  uint64_t i;

  // pull out stored info
  p = &st->P[1];

  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  pad0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];

  // compute powers r^2,r^4
  r20 = r0;
  r21 = r1;
  r22 = r2;
  for (i = 0; i < 2; i++) {
    s22 = r22 * (5 << 2);

    d[0] = add128(mul64x64_128(r20, r20), mul64x64_128(r21 * 2, s22));
    d[1] = add128(mul64x64_128(r22, s22), mul64x64_128(r20 * 2, r21));
    d[2] = add128(mul64x64_128(r21, r21), mul64x64_128(r22 * 2, r20));

    r20 = lo128(d[0]) & 0xfffffffffff;
    c = shr128(d[0], 44);
    d[1] = add128_64(d[1], c);
    r21 = lo128(d[1]) & 0xfffffffffff;
    c = shr128(d[1], 44);
    d[2] = add128_64(d[2], c);
    r22 = lo128(d[2]) & 0x3ffffffffff;
    c = shr128(d[2], 42);
    r20 += c * 5;
    c = (r20 >> 44);
    r20 = r20 & 0xfffffffffff;
    r21 += c;

    p->R20.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)(r20)&0x3ffffff),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->R21.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R22.v =
        _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 8)) & 0x3ffffff),
                          _MM_SHUFFLE(1, 0, 1, 0));
    p->R23.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R24.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r22 >> 16))),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->S21.v = _mm_mul_epu32(p->R21.v, FIVE);
    p->S22.v = _mm_mul_epu32(p->R22.v, FIVE);
    p->S23.v = _mm_mul_epu32(p->R23.v, FIVE);
    p->S24.v = _mm_mul_epu32(p->R24.v, FIVE);
    p--;
  }
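  // st->P[1] now holds r^2 and st->P[0] holds r^4 (p was decremented after
  // the first pass). Each power is re-split into five 26-bit limbs and
  // splatted across both SIMD lanes; the S2x entries are the corresponding
  // limbs premultiplied by 5 for the reduction (2^130 = 5 mod p).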

  // put saved info back
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);
  p->R23.d[1] = (uint32_t)(pad0);
  p->R23.d[3] = (uint32_t)(pad0 >> 32);
  p->R24.d[1] = (uint32_t)(pad1);
  p->R24.d[3] = (uint32_t)(pad1 >> 32);

  // H = [Mx,My]
  T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                          _mm_loadl_epi64((const xmmi *)(m + 16)));
  T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                          _mm_loadl_epi64((const xmmi *)(m + 24)));
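  // T5 pairs bytes 0-7 of the two 16-byte blocks (m and m+16) in its two
  // 64-bit lanes; T6 pairs bytes 8-15. The shifts and masks below split each
  // block into five 26-bit limbs, and HIBIT injects the 2^128 padding bit.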
  st->H[0] = _mm_and_si128(MMASK, T5);
  st->H[1] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
  st->H[2] = _mm_and_si128(MMASK, T5);
  st->H[3] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  st->H[4] = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
}

static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m,
                            size_t bytes) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi M0, M1, M2, M3, M4;
  xmmi C1, C2;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  while (bytes >= 64) {
    // H *= [r^4,r^4]
    p = &st->P[0];
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);
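    // Each T_i now holds limb i of H*[r^4,r^4] for both lanes. The S2x
    // operands are limbs of r^4 premultiplied by 5, which folds the
    // contributions that would land at or above 2^130 back down
    // (2^130 = 5 mod p).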

    // H += [Mx,My]*[r^2,r^2]
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    p = &st->P[1];
    T5 = _mm_mul_epu32(M0, p->R20.v);
    T6 = _mm_mul_epu32(M0, p->R21.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M1, p->S24.v);
    T6 = _mm_mul_epu32(M1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M2, p->S23.v);
    T6 = _mm_mul_epu32(M2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M3, p->S22.v);
    T6 = _mm_mul_epu32(M3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M4, p->S21.v);
    T6 = _mm_mul_epu32(M4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M0, p->R22.v);
    T6 = _mm_mul_epu32(M0, p->R23.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M1, p->R21.v);
    T6 = _mm_mul_epu32(M1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M2, p->R20.v);
    T6 = _mm_mul_epu32(M2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M3, p->S24.v);
    T6 = _mm_mul_epu32(M3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M4, p->S23.v);
    T6 = _mm_mul_epu32(M4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M0, p->R24.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 32)),
                            _mm_loadl_epi64((const xmmi *)(m + 48)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 40)),
                            _mm_loadl_epi64((const xmmi *)(m + 56)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    // reduce
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);
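    // Two interleaved carry chains (starting at limbs 0 and 3) bring every
    // limb back to roughly 26 bits; the carry out of the top limb re-enters
    // limb 0 multiplied by 5.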

    // H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My])
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    m += 64;
    bytes -= 64;
  }

  st->H[0] = H0;
  st->H[1] = H1;
  st->H[2] = H2;
  st->H[3] = H3;
  st->H[4] = H4;
}

static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m,
                               size_t bytes) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi M0, M1, M2, M3, M4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi C1, C2;

  uint64_t r0, r1, r2;
  uint64_t t0, t1, t2, t3, t4;
  uint64_t c;
  size_t consumed = 0;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  // p = [r^2,r^2]
  p = &st->P[1];

  if (bytes >= 32) {
    // H *= [r^2,r^2]
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    // reduce
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    // H = (H*[r^2,r^2] + [Mx,My])
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    consumed = 32;
  }

  // finalize, H *= [r^2,r]
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];

  p->R20.d[2] = (uint32_t)(r0)&0x3ffffff;
  p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
  p->R22.d[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
  p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
  p->R24.d[2] = (uint32_t)((r2 >> 16));
  p->S21.d[2] = p->R21.d[2] * 5;
  p->S22.d[2] = p->R22.d[2] * 5;
  p->S23.d[2] = p->R23.d[2] * 5;
  p->S24.d[2] = p->R24.d[2] * 5;
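  // Element d[0] of each R2x still holds a limb of r^2; element d[2] now
  // holds the matching limb of r itself (with S2x = 5 * limb). The multiply
  // below therefore applies r^2 to one accumulator lane and r to the other
  // before the two lanes are summed.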

  // H *= [r^2,r]
  T0 = _mm_mul_epu32(H0, p->R20.v);
  T1 = _mm_mul_epu32(H0, p->R21.v);
  T2 = _mm_mul_epu32(H0, p->R22.v);
  T3 = _mm_mul_epu32(H0, p->R23.v);
  T4 = _mm_mul_epu32(H0, p->R24.v);
  T5 = _mm_mul_epu32(H1, p->S24.v);
  T6 = _mm_mul_epu32(H1, p->R20.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H2, p->S23.v);
  T6 = _mm_mul_epu32(H2, p->S24.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H3, p->S22.v);
  T6 = _mm_mul_epu32(H3, p->S23.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H4, p->S21.v);
  T6 = _mm_mul_epu32(H4, p->S22.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H1, p->R21.v);
  T6 = _mm_mul_epu32(H1, p->R22.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H2, p->R20.v);
  T6 = _mm_mul_epu32(H2, p->R21.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H3, p->S24.v);
  T6 = _mm_mul_epu32(H3, p->R20.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H4, p->S23.v);
  T6 = _mm_mul_epu32(H4, p->S24.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H1, p->R23.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H2, p->R22.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H3, p->R21.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H4, p->R20.v);
  T4 = _mm_add_epi64(T4, T5);

  C1 = _mm_srli_epi64(T0, 26);
  C2 = _mm_srli_epi64(T3, 26);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_and_si128(T3, MMASK);
  T1 = _mm_add_epi64(T1, C1);
  T4 = _mm_add_epi64(T4, C2);
  C1 = _mm_srli_epi64(T1, 26);
  C2 = _mm_srli_epi64(T4, 26);
  T1 = _mm_and_si128(T1, MMASK);
  T4 = _mm_and_si128(T4, MMASK);
  T2 = _mm_add_epi64(T2, C1);
  T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
  C1 = _mm_srli_epi64(T2, 26);
  C2 = _mm_srli_epi64(T0, 26);
  T2 = _mm_and_si128(T2, MMASK);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_add_epi64(T3, C1);
  T1 = _mm_add_epi64(T1, C2);
  C1 = _mm_srli_epi64(T3, 26);
  T3 = _mm_and_si128(T3, MMASK);
  T4 = _mm_add_epi64(T4, C1);

  // H = H[0]+H[1]
  H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
  H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
  H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
  H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
  H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));

  t0 = _mm_cvtsi128_si32(H0);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = _mm_cvtsi128_si32(H1) + c;
  c = (t1 >> 26);
  t1 &= 0x3ffffff;
  t2 = _mm_cvtsi128_si32(H2) + c;
  c = (t2 >> 26);
  t2 &= 0x3ffffff;
  t3 = _mm_cvtsi128_si32(H3) + c;
  c = (t3 >> 26);
  t3 &= 0x3ffffff;
  t4 = _mm_cvtsi128_si32(H4) + c;
  c = (t4 >> 26);
  t4 &= 0x3ffffff;
  t0 = t0 + (c * 5);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = t1 + c;

  st->HH[0] = ((t0) | (t1 << 26)) & UINT64_C(0xfffffffffff);
  st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & UINT64_C(0xfffffffffff);
  st->HH[2] = ((t3 >> 10) | (t4 << 16)) & UINT64_C(0x3ffffffffff);
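  // The two lanes have been summed and the five 26-bit limbs repacked into
  // three 44/44/42-bit limbs in HH for the scalar tail handled in
  // CRYPTO_poly1305_finish.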

  return consumed;
}

void CRYPTO_poly1305_update(poly1305_state *state, const uint8_t *m,
                            size_t bytes) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t want;

  // need at least 32 initial bytes to start the accelerated branch
  if (!st->started) {
    if ((st->leftover == 0) && (bytes > 32)) {
      poly1305_first_block(st, m);
      m += 32;
      bytes -= 32;
    } else {
      want = poly1305_min(32 - st->leftover, bytes);
      OPENSSL_memcpy(st->buffer + st->leftover, m, want);
      bytes -= want;
      m += want;
      st->leftover += want;
      if ((st->leftover < 32) || (bytes == 0)) {
        return;
      }
      poly1305_first_block(st, st->buffer);
      st->leftover = 0;
    }
    st->started = 1;
  }

  // handle leftover
  if (st->leftover) {
    want = poly1305_min(64 - st->leftover, bytes);
    OPENSSL_memcpy(st->buffer + st->leftover, m, want);
    bytes -= want;
    m += want;
    st->leftover += want;
    if (st->leftover < 64) {
      return;
    }
    poly1305_blocks(st, st->buffer, 64);
    st->leftover = 0;
  }

  // process 64 byte blocks
  if (bytes >= 64) {
    want = (bytes & ~63);
    poly1305_blocks(st, m, want);
    m += want;
    bytes -= want;
  }

  if (bytes) {
    OPENSSL_memcpy(st->buffer + st->leftover, m, bytes);
    st->leftover += bytes;
  }
}

void CRYPTO_poly1305_finish(poly1305_state *state, uint8_t mac[16]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t leftover = st->leftover;
  uint8_t *m = st->buffer;
  uint128_t d[3];
  uint64_t h0, h1, h2;
  uint64_t t0, t1;
  uint64_t g0, g1, g2, c, nc;
  uint64_t r0, r1, r2, s1, s2;
  poly1305_power *p;

  if (st->started) {
    size_t consumed = poly1305_combine(st, m, leftover);
    leftover -= consumed;
    m += consumed;
  }

  // st->HH will either be 0 or have the combined result
  h0 = st->HH[0];
  h1 = st->HH[1];
  h2 = st->HH[2];

  p = &st->P[1];
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  s1 = r1 * (5 << 2);
  s2 = r2 * (5 << 2);
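  // s1 = 20*r1 and s2 = 20*r2. Cross terms of the 44/44/42-bit limbs land at
  // 2^132 = 4*2^130 = 4*5 mod p, so premultiplying by 20 lets them be folded
  // straight into the lower limbs during the multiply below.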

  if (leftover < 16) {
    goto poly1305_donna_atmost15bytes;
  }

poly1305_donna_atleast16bytes:
  t0 = U8TO64_LE(m + 0);
  t1 = U8TO64_LE(m + 8);
  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24) | ((uint64_t)1 << 40);
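  // ((uint64_t)1 << 40) above is the 2^128 padding bit for a full 16-byte
  // block: the top limb sits at 2^88, and 128 - 88 = 40.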

poly1305_donna_mul:
  d[0] = add128(add128(mul64x64_128(h0, r0), mul64x64_128(h1, s2)),
                mul64x64_128(h2, s1));
  d[1] = add128(add128(mul64x64_128(h0, r1), mul64x64_128(h1, r0)),
                mul64x64_128(h2, s2));
  d[2] = add128(add128(mul64x64_128(h0, r2), mul64x64_128(h1, r1)),
                mul64x64_128(h2, r0));
  h0 = lo128(d[0]) & 0xfffffffffff;
  c = shr128(d[0], 44);
  d[1] = add128_64(d[1], c);
  h1 = lo128(d[1]) & 0xfffffffffff;
  c = shr128(d[1], 44);
  d[2] = add128_64(d[2], c);
  h2 = lo128(d[2]) & 0x3ffffffffff;
  c = shr128(d[2], 42);
  h0 += c * 5;

  m += 16;
  leftover -= 16;
  if (leftover >= 16) {
    goto poly1305_donna_atleast16bytes;
  }

  // final bytes
poly1305_donna_atmost15bytes:
  if (!leftover) {
    goto poly1305_donna_finish;
  }

  m[leftover++] = 1;
  OPENSSL_memset(m + leftover, 0, 16 - leftover);
  leftover = 16;

  t0 = U8TO64_LE(m + 0);
  t1 = U8TO64_LE(m + 8);
  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24);

  goto poly1305_donna_mul;

poly1305_donna_finish:
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  h1 += c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  h2 += c;
  c = (h2 >> 42);
  h2 &= 0x3ffffffffff;
  h0 += c * 5;

  g0 = h0 + 5;
  c = (g0 >> 44);
  g0 &= 0xfffffffffff;
  g1 = h1 + c;
  c = (g1 >> 44);
  g1 &= 0xfffffffffff;
  g2 = h2 + c - ((uint64_t)1 << 42);

  c = (g2 >> 63) - 1;
  nc = ~c;
  h0 = (h0 & nc) | (g0 & c);
  h1 = (h1 & nc) | (g1 & c);
  h2 = (h2 & nc) | (g2 & c);
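  // g = h + 5 - 2^130, i.e. h - p. If the subtraction did not borrow
  // (h >= p), the sign bit of g2 is clear and c is all ones, so g is
  // selected; otherwise h is kept. The select is branch-free, so it does not
  // leak anything about h.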

  // pad
  t0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  t1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];
  h0 += (t0 & 0xfffffffffff);
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += (t0 & 0xfffffffffff) + c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  t1 = (t1 >> 24);
  h2 += t1 + c;

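  // mac = (h + pad) % (2^128): repack the 44/44/42-bit limbs into two
  // little-endian 64-bit words to form the tag.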
  U64TO8_LE(mac + 0, ((h0) | (h1 << 44)));
  U64TO8_LE(mac + 8, ((h1 >> 20) | (h2 << 24)));
}

#endif  // !OPENSSL_WINDOWS && OPENSSL_X86_64