/*
   BLAKE2 reference source code package - optimized C implementations

   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
   your option.  The terms of these licenses can be found at:

   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
   - OpenSSL license   : https://www.openssl.org/source/license.html
   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0

   More information about the BLAKE2 hash function can be found at
   https://blake2.net.
*/
15 #pragma once
16 #ifndef __BLAKE2S_LOAD_SSE41_H__
17 #define __BLAKE2S_LOAD_SSE41_H__
18 
/*
 * LOAD_MSG_0_1(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Writes only buf; reads m0 and m1, resolved at the
 * expansion site. TOF/TOI are defined outside this header -- presumably the
 * bit-preserving __m128i <-> __m128 casts needed to use the float shuffle on
 * integer data (confirm).
 * Under that assumption, 32-bit lanes low to high:
 *   buf = { m0[0], m0[2], m1[0], m1[2] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 0, 2, 4, 6 (BLAKE2s sigma[0] positions 0, 2, 4, 6) -- confirm
 * against the caller.
 */
#define LOAD_MSG_0_1(buf) \
  buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(2,0,2,0)));

/*
 * LOAD_MSG_0_2(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Writes only buf; reads m0 and m1, resolved at the
 * expansion site. TOF/TOI are defined outside this header -- presumably the
 * bit-preserving __m128i <-> __m128 casts needed to use the float shuffle on
 * integer data (confirm).
 * Under that assumption, 32-bit lanes low to high:
 *   buf = { m0[1], m0[3], m1[1], m1[3] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 1, 3, 5, 7 (BLAKE2s sigma[0] positions 1, 3, 5, 7) -- confirm
 * against the caller.
 */
#define LOAD_MSG_0_2(buf) \
  buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(3,1,3,1)));

/*
 * LOAD_MSG_0_3(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Writes only buf; reads m2 and m3, resolved at the
 * expansion site. TOF/TOI are defined outside this header -- presumably the
 * bit-preserving __m128i <-> __m128 casts needed to use the float shuffle on
 * integer data (confirm).
 * Under that assumption, 32-bit lanes low to high:
 *   buf = { m2[0], m2[2], m3[0], m3[2] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 8, 10, 12, 14 (BLAKE2s sigma[0] positions 8, 10, 12, 14) -- confirm
 * against the caller.
 */
#define LOAD_MSG_0_3(buf) \
  buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(2,0,2,0)));

/*
 * LOAD_MSG_0_4(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Writes only buf; reads m2 and m3, resolved at the
 * expansion site. TOF/TOI are defined outside this header -- presumably the
 * bit-preserving __m128i <-> __m128 casts needed to use the float shuffle on
 * integer data (confirm).
 * Under that assumption, 32-bit lanes low to high:
 *   buf = { m2[1], m2[3], m3[1], m3[3] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 9, 11, 13, 15 (BLAKE2s sigma[0] positions 9, 11, 13, 15) -- confirm
 * against the caller.
 */
#define LOAD_MSG_0_4(buf) \
  buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(3,1,3,1)));

/*
 * LOAD_MSG_1_1(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1, t2 and then buf; reads m1, m2,
 * m3. All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m3[2], m1[0], m2[1], m3[1] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 14, 4, 9, 13 (BLAKE2s sigma[1] positions 0, 2, 4, 6) -- confirm
 * against the caller.
 */
#define LOAD_MSG_1_1(buf) \
  t0 = _mm_blend_epi16(m1, m2, 0x0C);  /* { m1[0], m2[1], m1[2], m1[3] } */ \
  t1 = _mm_slli_si128(m3, 4);          /* { 0, m3[0], m3[1], m3[2] } */ \
  t2 = _mm_blend_epi16(t0, t1, 0xF0);  /* { m1[0], m2[1], m3[1], m3[2] } */ \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3));

/*
 * LOAD_MSG_1_2(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1, t2 and then buf; reads m1, m2,
 * m3. All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m2[2], m2[0], m3[3], m1[2] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 10, 8, 15, 6 (BLAKE2s sigma[1] positions 1, 3, 5, 7) -- confirm
 * against the caller.
 */
#define LOAD_MSG_1_2(buf) \
  t0 = _mm_shuffle_epi32(m2, _MM_SHUFFLE(0,0,2,0));  /* { m2[0], m2[2], m2[0], m2[0] } */ \
  t1 = _mm_blend_epi16(m1, m3, 0xC0);                /* { m1[0], m1[1], m1[2], m3[3] } */ \
  t2 = _mm_blend_epi16(t0, t1, 0xF0);                /* { m2[0], m2[2], m1[2], m3[3] } */ \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));

/*
 * LOAD_MSG_1_3(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1, t2 and then buf; reads m0, m1,
 * m2. All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m0[1], m0[0], m2[3], m1[1] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 1, 0, 11, 5 (BLAKE2s sigma[1] positions 8, 10, 12, 14) -- confirm
 * against the caller.
 */
#define LOAD_MSG_1_3(buf) \
  t0 = _mm_slli_si128(m1, 4);          /* { 0, m1[0], m1[1], m1[2] } */ \
  t1 = _mm_blend_epi16(m2, t0, 0x30);  /* { m2[0], m2[1], m1[1], m2[3] } */ \
  t2 = _mm_blend_epi16(m0, t1, 0xF0);  /* { m0[0], m0[1], m1[1], m2[3] } */ \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));

/*
 * LOAD_MSG_1_4(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1, t2 and then buf; reads m0, m1,
 * m3. All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m3[0], m0[2], m1[3], m0[3] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 12, 2, 7, 3 (BLAKE2s sigma[1] positions 9, 11, 13, 15) -- confirm
 * against the caller.
 */
#define LOAD_MSG_1_4(buf) \
  t0 = _mm_unpackhi_epi32(m0, m1);     /* { m0[2], m1[2], m0[3], m1[3] } */ \
  t1 = _mm_slli_si128(m3, 4);          /* { 0, m3[0], m3[1], m3[2] } */ \
  t2 = _mm_blend_epi16(t0, t1, 0x0C);  /* { m0[2], m3[0], m0[3], m1[3] } */ \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));

/*
 * LOAD_MSG_2_1(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1, t2 and then buf; reads m1, m2,
 * m3. All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m2[3], m3[0], m1[1], m3[3] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 11, 12, 5, 15 (BLAKE2s sigma[2] positions 0, 2, 4, 6) -- confirm
 * against the caller.
 */
#define LOAD_MSG_2_1(buf) \
  t0 = _mm_unpackhi_epi32(m2, m3);     /* { m2[2], m3[2], m2[3], m3[3] } */ \
  t1 = _mm_blend_epi16(m3, m1, 0x0C);  /* { m3[0], m1[1], m3[2], m3[3] } */ \
  t2 = _mm_blend_epi16(t0, t1, 0x0F);  /* { m3[0], m1[1], m2[3], m3[3] } */ \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));

/*
 * LOAD_MSG_2_2(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1, t2 and then buf; reads m0, m2,
 * m3. All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m2[0], m0[0], m0[2], m3[1] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 8, 0, 2, 13 (BLAKE2s sigma[2] positions 1, 3, 5, 7) -- confirm
 * against the caller.
 */
#define LOAD_MSG_2_2(buf) \
  t0 = _mm_unpacklo_epi32(m2, m0);     /* { m2[0], m0[0], m2[1], m0[1] } */ \
  t1 = _mm_blend_epi16(t0, m0, 0xF0);  /* { m2[0], m0[0], m0[2], m0[3] } */ \
  t2 = _mm_slli_si128(m3, 8);          /* { 0, 0, m3[0], m3[1] } */ \
  buf = _mm_blend_epi16(t1, t2, 0xC0);

/*
 * LOAD_MSG_2_3(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1, t2 and then buf; reads m0, m1,
 * m2. All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m2[2], m0[3], m1[3], m2[1] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 10, 3, 7, 9 (BLAKE2s sigma[2] positions 8, 10, 12, 14) -- confirm
 * against the caller.
 */
#define LOAD_MSG_2_3(buf) \
  t0 = _mm_blend_epi16(m0, m2, 0x3C);  /* { m0[0], m2[1], m2[2], m0[3] } */ \
  t1 = _mm_srli_si128(m1, 12);         /* { m1[3], 0, 0, 0 } */ \
  t2 = _mm_blend_epi16(t0, t1, 0x03);  /* { m1[3], m2[1], m2[2], m0[3] } */ \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,3,2));

/*
 * LOAD_MSG_2_4(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1, t2 and then buf; reads m0, m1,
 * m3. All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m3[2], m1[2], m0[1], m1[0] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 14, 6, 1, 4 (BLAKE2s sigma[2] positions 9, 11, 13, 15) -- confirm
 * against the caller.
 */
#define LOAD_MSG_2_4(buf) \
  t0 = _mm_slli_si128(m3, 4);          /* { 0, m3[0], m3[1], m3[2] } */ \
  t1 = _mm_blend_epi16(m0, m1, 0x33);  /* { m1[0], m0[1], m1[2], m0[3] } */ \
  t2 = _mm_blend_epi16(t1, t0, 0xC0);  /* { m1[0], m0[1], m1[2], m3[2] } */ \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3));

/*
 * LOAD_MSG_3_1(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1, t2 and then buf; reads m0, m1,
 * m2, m3. All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m1[3], m0[3], m3[1], m2[3] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 7, 3, 13, 11 (BLAKE2s sigma[3] positions 0, 2, 4, 6) -- confirm
 * against the caller.
 */
#define LOAD_MSG_3_1(buf) \
  t0 = _mm_unpackhi_epi32(m0, m1);     /* { m0[2], m1[2], m0[3], m1[3] } */ \
  t1 = _mm_unpackhi_epi32(t0, m2);     /* { m0[3], m2[2], m1[3], m2[3] } */ \
  t2 = _mm_blend_epi16(t1, m3, 0x0C);  /* { m0[3], m3[1], m1[3], m2[3] } */ \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));

/*
 * LOAD_MSG_3_2(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1, t2 and then buf; reads m0, m2,
 * m3. All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m2[1], m0[1], m3[0], m3[2] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 9, 1, 12, 14 (BLAKE2s sigma[3] positions 1, 3, 5, 7) -- confirm
 * against the caller.
 */
#define LOAD_MSG_3_2(buf) \
  t0 = _mm_slli_si128(m2, 8);          /* { 0, 0, m2[0], m2[1] } */ \
  t1 = _mm_blend_epi16(m3, m0, 0x0C);  /* { m3[0], m0[1], m3[2], m3[3] } */ \
  t2 = _mm_blend_epi16(t1, t0, 0xC0);  /* { m3[0], m0[1], m3[2], m2[1] } */ \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));

/*
 * LOAD_MSG_3_3(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1 and then buf; reads m0, m1, m3.
 * All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m0[2], m1[1], m1[0], m3[3] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 2, 5, 4, 15 (BLAKE2s sigma[3] positions 8, 10, 12, 14) -- confirm
 * against the caller.
 */
#define LOAD_MSG_3_3(buf) \
  t0 = _mm_blend_epi16(m0, m1, 0x0F);  /* { m1[0], m1[1], m0[2], m0[3] } */ \
  t1 = _mm_blend_epi16(t0, m3, 0xC0);  /* { m1[0], m1[1], m0[2], m3[3] } */ \
  buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));

/*
 * LOAD_MSG_3_4(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1 and then buf; reads m0, m1, m2.
 * All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m1[2], m2[2], m0[0], m2[0] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 6, 10, 0, 8 (BLAKE2s sigma[3] positions 9, 11, 13, 15) -- confirm
 * against the caller.
 */
#define LOAD_MSG_3_4(buf) \
  t0 = _mm_unpacklo_epi32(m0, m2);  /* { m0[0], m2[0], m0[1], m2[1] } */ \
  t1 = _mm_unpackhi_epi32(m1, m2);  /* { m1[2], m2[2], m1[3], m2[3] } */ \
  buf = _mm_unpacklo_epi64(t1, t0);

/*
 * LOAD_MSG_4_1(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1, t2 and then buf; reads m0, m1,
 * m2. All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m2[1], m1[1], m0[2], m2[2] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 9, 5, 2, 10 (BLAKE2s sigma[4] positions 0, 2, 4, 6) -- confirm
 * against the caller.
 */
#define LOAD_MSG_4_1(buf) \
  t0 = _mm_unpacklo_epi64(m1, m2);     /* { m1[0], m1[1], m2[0], m2[1] } */ \
  t1 = _mm_unpackhi_epi64(m0, m2);     /* { m0[2], m0[3], m2[2], m2[3] } */ \
  t2 = _mm_blend_epi16(t0, t1, 0x33);  /* { m0[2], m1[1], m2[2], m2[1] } */ \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));

/*
 * LOAD_MSG_4_2(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1 and then buf; reads m0, m1, m3.
 * All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m0[0], m1[3], m1[0], m3[3] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 0, 7, 4, 15 (BLAKE2s sigma[4] positions 1, 3, 5, 7) -- confirm
 * against the caller.
 */
#define LOAD_MSG_4_2(buf) \
  t0 = _mm_unpackhi_epi64(m1, m3);  /* { m1[2], m1[3], m3[2], m3[3] } */ \
  t1 = _mm_unpacklo_epi64(m0, m1);  /* { m0[0], m0[1], m1[0], m1[1] } */ \
  buf = _mm_blend_epi16(t0, t1, 0x33);

/*
 * LOAD_MSG_4_3(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1 and then buf; reads m0, m1, m2,
 * m3. All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m3[2], m2[3], m1[2], m0[3] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 14, 11, 6, 3 (BLAKE2s sigma[4] positions 8, 10, 12, 14) -- confirm
 * against the caller.
 */
#define LOAD_MSG_4_3(buf) \
  t0 = _mm_unpackhi_epi64(m3, m1);  /* { m3[2], m3[3], m1[2], m1[3] } */ \
  t1 = _mm_unpackhi_epi64(m2, m0);  /* { m2[2], m2[3], m0[2], m0[3] } */ \
  buf = _mm_blend_epi16(t1, t0, 0x33);

/*
 * LOAD_MSG_4_4(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1, t2 and then buf; reads m0, m2,
 * m3. All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m0[1], m3[0], m2[0], m3[1] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 1, 12, 8, 13 (BLAKE2s sigma[4] positions 9, 11, 13, 15) -- confirm
 * against the caller.
 */
#define LOAD_MSG_4_4(buf) \
  t0 = _mm_blend_epi16(m0, m2, 0x03);  /* { m2[0], m0[1], m0[2], m0[3] } */ \
  t1 = _mm_slli_si128(t0, 8);          /* { 0, 0, m2[0], m0[1] } */ \
  t2 = _mm_blend_epi16(t1, m3, 0x0F);  /* { m3[0], m3[1], m2[0], m0[1] } */ \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,0,3));

/*
 * LOAD_MSG_5_1(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1 and then buf; reads m0, m1, m2.
 * All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m0[2], m1[2], m0[0], m2[0] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 2, 6, 0, 8 (BLAKE2s sigma[5] positions 0, 2, 4, 6) -- confirm
 * against the caller.
 */
#define LOAD_MSG_5_1(buf) \
  t0 = _mm_unpackhi_epi32(m0, m1);  /* { m0[2], m1[2], m0[3], m1[3] } */ \
  t1 = _mm_unpacklo_epi32(m0, m2);  /* { m0[0], m2[0], m0[1], m2[1] } */ \
  buf = _mm_unpacklo_epi64(t0, t1);

/*
 * LOAD_MSG_5_2(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1 and then buf; reads m0, m2, m3.
 * All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m3[0], m2[2], m2[3], m0[3] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 12, 10, 11, 3 (BLAKE2s sigma[5] positions 1, 3, 5, 7) -- confirm
 * against the caller.
 */
#define LOAD_MSG_5_2(buf) \
  t0 = _mm_srli_si128(m2, 4);          /* { m2[1], m2[2], m2[3], 0 } */ \
  t1 = _mm_blend_epi16(m0, m3, 0x03);  /* { m3[0], m0[1], m0[2], m0[3] } */ \
  buf = _mm_blend_epi16(t1, t0, 0x3C);

/*
 * LOAD_MSG_5_3(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1, t2 and then buf; reads m0, m1,
 * m3. All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m1[0], m1[3], m3[3], m0[1] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 4, 7, 15, 1 (BLAKE2s sigma[5] positions 8, 10, 12, 14) -- confirm
 * against the caller.
 */
#define LOAD_MSG_5_3(buf) \
  t0 = _mm_blend_epi16(m1, m0, 0x0C);  /* { m1[0], m0[1], m1[2], m1[3] } */ \
  t1 = _mm_srli_si128(m3, 4);          /* { m3[1], m3[2], m3[3], 0 } */ \
  t2 = _mm_blend_epi16(t0, t1, 0x30);  /* { m1[0], m0[1], m3[3], m1[3] } */ \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0));

/*
 * LOAD_MSG_5_4(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1 and then buf; reads m1, m2, m3.
 * All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m3[1], m1[1], m3[2], m2[1] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 13, 5, 14, 9 (BLAKE2s sigma[5] positions 9, 11, 13, 15) -- confirm
 * against the caller.
 */
#define LOAD_MSG_5_4(buf) \
  t0 = _mm_unpacklo_epi64(m1, m2);                   /* { m1[0], m1[1], m2[0], m2[1] } */ \
  t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,2,0,1));  /* { m3[1], m3[0], m3[2], m3[0] } */ \
  buf = _mm_blend_epi16(t0, t1, 0x33);

/*
 * LOAD_MSG_6_1(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1 and then buf; reads m0, m1, m3.
 * All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m3[0], m0[1], m3[2], m1[0] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 12, 1, 14, 4 (BLAKE2s sigma[6] positions 0, 2, 4, 6) -- confirm
 * against the caller.
 */
#define LOAD_MSG_6_1(buf) \
  t0 = _mm_slli_si128(m1, 12);         /* { 0, 0, 0, m1[0] } */ \
  t1 = _mm_blend_epi16(m0, m3, 0x33);  /* { m3[0], m0[1], m3[2], m0[3] } */ \
  buf = _mm_blend_epi16(t1, t0, 0xC0);

/*
 * LOAD_MSG_6_2(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1, t2 and then buf; reads m1, m2,
 * m3. All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m1[1], m3[3], m3[1], m2[2] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 5, 15, 13, 10 (BLAKE2s sigma[6] positions 1, 3, 5, 7) -- confirm
 * against the caller.
 */
#define LOAD_MSG_6_2(buf) \
  t0 = _mm_blend_epi16(m3, m2, 0x30);  /* { m3[0], m3[1], m2[2], m3[3] } */ \
  t1 = _mm_srli_si128(m1, 4);          /* { m1[1], m1[2], m1[3], 0 } */ \
  t2 = _mm_blend_epi16(t0, t1, 0x03);  /* { m1[1], m3[1], m2[2], m3[3] } */ \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,3,0));

/*
 * LOAD_MSG_6_3(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1 and then buf; reads m0, m1, m2.
 * All names except buf are resolved at the expansion site.
 * The inner blend yields { m0[0], m1[2], m2[0], m2[1] }; the outer shuffle
 * swaps the top two lanes.
 * 32-bit lanes, low to high: buf = { m0[0], m1[2], m2[1], m2[0] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 0, 6, 9, 8 (BLAKE2s sigma[6] positions 8, 10, 12, 14) -- confirm
 * against the caller.
 */
#define LOAD_MSG_6_3(buf) \
  t0 = _mm_unpacklo_epi64(m0, m2);  /* { m0[0], m0[1], m2[0], m2[1] } */ \
  t1 = _mm_srli_si128(m1, 4);       /* { m1[1], m1[2], m1[3], 0 } */ \
  buf = _mm_shuffle_epi32(_mm_blend_epi16(t0, t1, 0x0C), _MM_SHUFFLE(2,3,1,0));

/*
 * LOAD_MSG_6_4(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1 and then buf; reads m0, m1, m2.
 * All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m1[3], m0[3], m0[2], m2[3] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 7, 3, 2, 11 (BLAKE2s sigma[6] positions 9, 11, 13, 15) -- confirm
 * against the caller.
 */
#define LOAD_MSG_6_4(buf) \
  t0 = _mm_unpackhi_epi32(m1, m2);  /* { m1[2], m2[2], m1[3], m2[3] } */ \
  t1 = _mm_unpackhi_epi64(m0, t0);  /* { m0[2], m0[3], m1[3], m2[3] } */ \
  buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));

/*
 * LOAD_MSG_7_1(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1 and then buf; reads m0, m1, m3.
 * All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m3[1], m1[3], m3[0], m0[3] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 13, 7, 12, 3 (BLAKE2s sigma[7] positions 0, 2, 4, 6) -- confirm
 * against the caller.
 */
#define LOAD_MSG_7_1(buf) \
  t0 = _mm_unpackhi_epi32(m0, m1);     /* { m0[2], m1[2], m0[3], m1[3] } */ \
  t1 = _mm_blend_epi16(t0, m3, 0x0F);  /* { m3[0], m3[1], m0[3], m1[3] } */ \
  buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(2,0,3,1));

/*
 * LOAD_MSG_7_2(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1, t2 and then buf; reads m0, m2,
 * m3. All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m2[3], m3[2], m0[1], m2[1] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 11, 14, 1, 9 (BLAKE2s sigma[7] positions 1, 3, 5, 7) -- confirm
 * against the caller.
 */
#define LOAD_MSG_7_2(buf) \
  t0 = _mm_blend_epi16(m2, m3, 0x30);  /* { m2[0], m2[1], m3[2], m2[3] } */ \
  t1 = _mm_srli_si128(m0, 4);          /* { m0[1], m0[2], m0[3], 0 } */ \
  t2 = _mm_blend_epi16(t0, t1, 0x03);  /* { m0[1], m2[1], m3[2], m2[3] } */ \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,2,3));

/*
 * LOAD_MSG_7_3(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1, t2 and then buf; reads m0, m1,
 * m2, m3. All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m1[1], m3[3], m2[0], m0[2] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 5, 15, 8, 2 (BLAKE2s sigma[7] positions 8, 10, 12, 14) -- confirm
 * against the caller.
 */
#define LOAD_MSG_7_3(buf) \
  t0 = _mm_unpackhi_epi64(m0, m3);     /* { m0[2], m0[3], m3[2], m3[3] } */ \
  t1 = _mm_unpacklo_epi64(m1, m2);     /* { m1[0], m1[1], m2[0], m2[1] } */ \
  t2 = _mm_blend_epi16(t0, t1, 0x3C);  /* { m0[2], m1[1], m2[0], m3[3] } */ \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,2,3,1));

/*
 * LOAD_MSG_7_4(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1 and then buf; reads m0, m1, m2.
 * All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m0[0], m1[0], m1[2], m2[2] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 0, 4, 6, 10 (BLAKE2s sigma[7] positions 9, 11, 13, 15) -- confirm
 * against the caller.
 */
#define LOAD_MSG_7_4(buf) \
  t0 = _mm_unpacklo_epi32(m0, m1);  /* { m0[0], m1[0], m0[1], m1[1] } */ \
  t1 = _mm_unpackhi_epi32(m1, m2);  /* { m1[2], m2[2], m1[3], m2[3] } */ \
  buf = _mm_unpacklo_epi64(t0, t1);

/*
 * LOAD_MSG_8_1(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1, t2 and then buf; reads m0, m1,
 * m2, m3. All names except buf are resolved at the expansion site.
 * The final _mm_shufflehi_epi16 with (1,0,3,2) moves 16-bit pairs, which
 * here amounts to swapping the two upper 32-bit lanes.
 * 32-bit lanes, low to high: buf = { m1[2], m3[2], m2[3], m0[0] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 6, 14, 11, 0 (BLAKE2s sigma[8] positions 0, 2, 4, 6) -- confirm
 * against the caller.
 */
#define LOAD_MSG_8_1(buf) \
  t0 = _mm_unpackhi_epi32(m1, m3);     /* { m1[2], m3[2], m1[3], m3[3] } */ \
  t1 = _mm_unpacklo_epi64(t0, m0);     /* { m1[2], m3[2], m0[0], m0[1] } */ \
  t2 = _mm_blend_epi16(t1, m2, 0xC0);  /* { m1[2], m3[2], m0[0], m2[3] } */ \
  buf = _mm_shufflehi_epi16(t2, _MM_SHUFFLE(1,0,3,2));

/*
 * LOAD_MSG_8_2(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1 and then buf; reads m0, m2, m3.
 * All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m3[3], m2[1], m0[3], m2[0] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 15, 9, 3, 8 (BLAKE2s sigma[8] positions 1, 3, 5, 7) -- confirm
 * against the caller.
 */
#define LOAD_MSG_8_2(buf) \
  t0 = _mm_unpackhi_epi32(m0, m3);     /* { m0[2], m3[2], m0[3], m3[3] } */ \
  t1 = _mm_blend_epi16(m2, t0, 0xF0);  /* { m2[0], m2[1], m0[3], m3[3] } */ \
  buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(0,2,1,3));

/*
 * LOAD_MSG_8_3(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1 and then buf; reads m0, m2, m3.
 * All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m3[0], m3[1], m0[1], m2[2] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 12, 13, 1, 10 (BLAKE2s sigma[8] positions 8, 10, 12, 14) -- confirm
 * against the caller.
 */
#define LOAD_MSG_8_3(buf) \
  t0 = _mm_blend_epi16(m2, m0, 0x0C);  /* { m2[0], m0[1], m2[2], m2[3] } */ \
  t1 = _mm_slli_si128(t0, 4);          /* { 0, m2[0], m0[1], m2[2] } */ \
  buf = _mm_blend_epi16(t1, m3, 0x0F);

/*
 * LOAD_MSG_8_4(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0 and then buf; reads m0 and m1.
 * All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m0[2], m1[3], m1[0], m1[1] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 2, 7, 4, 5 (BLAKE2s sigma[8] positions 9, 11, 13, 15) -- confirm
 * against the caller.
 */
#define LOAD_MSG_8_4(buf) \
  t0 = _mm_blend_epi16(m1, m0, 0x30);  /* { m1[0], m1[1], m0[2], m1[3] } */ \
  buf = _mm_shuffle_epi32(t0, _MM_SHUFFLE(1,0,3,2));

/*
 * LOAD_MSG_9_1(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1, t2 and then buf; reads m0, m1,
 * m2. All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m2[2], m2[0], m1[3], m0[1] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 10, 8, 7, 1 (BLAKE2s sigma[9] positions 0, 2, 4, 6) -- confirm
 * against the caller.
 */
#define LOAD_MSG_9_1(buf) \
  t0 = _mm_blend_epi16(m0, m2, 0x03);  /* { m2[0], m0[1], m0[2], m0[3] } */ \
  t1 = _mm_blend_epi16(m1, m2, 0x30);  /* { m1[0], m1[1], m2[2], m1[3] } */ \
  t2 = _mm_blend_epi16(t1, t0, 0x0F);  /* { m2[0], m0[1], m2[2], m1[3] } */ \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,3,0,2));

/*
 * LOAD_MSG_9_2(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1 and then buf; reads m0 and m1.
 * All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m0[2], m1[0], m1[2], m1[1] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 2, 4, 6, 5 (BLAKE2s sigma[9] positions 1, 3, 5, 7) -- confirm
 * against the caller.
 */
#define LOAD_MSG_9_2(buf) \
  t0 = _mm_slli_si128(m0, 4);          /* { 0, m0[0], m0[1], m0[2] } */ \
  t1 = _mm_blend_epi16(m1, t0, 0xC0);  /* { m1[0], m1[1], m1[2], m0[2] } */ \
  buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(1,2,0,3));

/*
 * LOAD_MSG_9_3(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1, t2 and then buf; reads m0, m2,
 * m3. All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m3[3], m2[1], m0[3], m3[1] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 15, 9, 3, 13 (BLAKE2s sigma[9] positions 8, 10, 12, 14) -- confirm
 * against the caller.
 */
#define LOAD_MSG_9_3(buf) \
  t0 = _mm_unpackhi_epi32(m0, m3);  /* { m0[2], m3[2], m0[3], m3[3] } */ \
  t1 = _mm_unpacklo_epi32(m2, m3);  /* { m2[0], m3[0], m2[1], m3[1] } */ \
  t2 = _mm_unpackhi_epi64(t0, t1);  /* { m0[3], m3[3], m2[1], m3[1] } */ \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,0,2,1));

/*
 * LOAD_MSG_9_4(buf): statement macro (the expansion already ends in ';' and
 * is not an expression). Overwrites t0, t1, t2 and then buf; reads m0, m2,
 * m3. All names except buf are resolved at the expansion site.
 * 32-bit lanes, low to high: buf = { m2[3], m3[2], m3[0], m0[0] }.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 this is
 * words 11, 14, 12, 0 (BLAKE2s sigma[9] positions 9, 11, 13, 15) -- confirm
 * against the caller.
 */
#define LOAD_MSG_9_4(buf) \
  t0 = _mm_blend_epi16(m3, m2, 0xC0);  /* { m3[0], m3[1], m3[2], m2[3] } */ \
  t1 = _mm_unpacklo_epi32(m0, m3);     /* { m0[0], m3[0], m0[1], m3[1] } */ \
  t2 = _mm_blend_epi16(t0, t1, 0x0F);  /* { m0[0], m3[0], m3[2], m2[3] } */ \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3));

230 #endif
231 
232