/*
   BLAKE2 reference source code package - optimized C implementations

   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
   your option.  The terms of these licenses can be found at:

   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
   - OpenSSL license   : https://www.openssl.org/source/license.html
   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0

   More information about the BLAKE2 hash function can be found at
   https://blake2.net.
*/
#ifndef BLAKE2S_LOAD_SSE41_H
#define BLAKE2S_LOAD_SSE41_H

/*
   Message-word loading macros: LOAD_MSG_<r>_<i>(buf), r = 0..9, i = 1..4.

   Contract, as visible from the macro bodies:

   - Each macro assigns exactly one value to `buf`, computed from the names
     m0, m1, m2 and m3, which are only read.
   - All macros other than LOAD_MSG_0_* overwrite the scratch name t0 and,
     depending on the macro, t1 and t2.
   - m0..m3, t0..t2 and the TOF/TOI helpers used by LOAD_MSG_0_* are not
     defined in this header.  They must be in scope at the expansion site,
     together with the intrinsics header(s) declaring the _mm_* functions.
   - Every expansion is a bare statement sequence that already ends in ';'.
     A call written as `LOAD_MSG_0_1(x);` therefore produces an extra empty
     statement, and the multi-statement macros must not be used as the body
     of an unbraced if/else/loop.

   NOTE(review): <r> is presumably the round index and <i> the position of
   the load within that round - confirm against the round macros that
   consume these definitions.
   NOTE(review): TOF/TOI presumably reinterpret a vector between its integer
   and float views so that _mm_shuffle_ps can be applied - confirm against
   their definitions in the including file.
*/

/* ---- LOAD_MSG_0_* ------------------------------------------------------ */

#define LOAD_MSG_0_1(buf) \
  buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(2, 0, 2, 0)));

#define LOAD_MSG_0_2(buf) \
  buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(3, 1, 3, 1)));

#define LOAD_MSG_0_3(buf) \
  buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(2, 0, 2, 0)));

#define LOAD_MSG_0_4(buf) \
  buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(3, 1, 3, 1)));

/* ---- LOAD_MSG_1_* ------------------------------------------------------ */

#define LOAD_MSG_1_1(buf) \
  t0 = _mm_blend_epi16(m1, m2, 0x0C); \
  t1 = _mm_slli_si128(m3, 4); \
  t2 = _mm_blend_epi16(t0, t1, 0xF0); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));

#define LOAD_MSG_1_2(buf) \
  t0 = _mm_shuffle_epi32(m2, _MM_SHUFFLE(0, 0, 2, 0)); \
  t1 = _mm_blend_epi16(m1, m3, 0xC0); \
  t2 = _mm_blend_epi16(t0, t1, 0xF0); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 3, 0, 1));

#define LOAD_MSG_1_3(buf) \
  t0 = _mm_slli_si128(m1, 4); \
  t1 = _mm_blend_epi16(m2, t0, 0x30); \
  t2 = _mm_blend_epi16(m0, t1, 0xF0); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 3, 0, 1));

#define LOAD_MSG_1_4(buf) \
  t0 = _mm_unpackhi_epi32(m0, m1); \
  t1 = _mm_slli_si128(m3, 4); \
  t2 = _mm_blend_epi16(t0, t1, 0x0C); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 3, 0, 1));

/* ---- LOAD_MSG_2_* ------------------------------------------------------ */

#define LOAD_MSG_2_1(buf) \
  t0 = _mm_unpackhi_epi32(m2, m3); \
  t1 = _mm_blend_epi16(m3, m1, 0x0C); \
  t2 = _mm_blend_epi16(t0, t1, 0x0F); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 1, 0, 2));

#define LOAD_MSG_2_2(buf) \
  t0 = _mm_unpacklo_epi32(m2, m0); \
  t1 = _mm_blend_epi16(t0, m0, 0xF0); \
  t2 = _mm_slli_si128(m3, 8); \
  buf = _mm_blend_epi16(t1, t2, 0xC0);

#define LOAD_MSG_2_3(buf) \
  t0 = _mm_blend_epi16(m0, m2, 0x3C); \
  t1 = _mm_srli_si128(m1, 12); \
  t2 = _mm_blend_epi16(t0, t1, 0x03); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 0, 3, 2));

#define LOAD_MSG_2_4(buf) \
  t0 = _mm_slli_si128(m3, 4); \
  t1 = _mm_blend_epi16(m0, m1, 0x33); \
  t2 = _mm_blend_epi16(t1, t0, 0xC0); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0, 1, 2, 3));

/* ---- LOAD_MSG_3_* ------------------------------------------------------ */

#define LOAD_MSG_3_1(buf) \
  t0 = _mm_unpackhi_epi32(m0, m1); \
  t1 = _mm_unpackhi_epi32(t0, m2); \
  t2 = _mm_blend_epi16(t1, m3, 0x0C); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 1, 0, 2));

#define LOAD_MSG_3_2(buf) \
  t0 = _mm_slli_si128(m2, 8); \
  t1 = _mm_blend_epi16(m3, m0, 0x0C); \
  t2 = _mm_blend_epi16(t1, t0, 0xC0); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 0, 1, 3));

#define LOAD_MSG_3_3(buf) \
  t0 = _mm_blend_epi16(m0, m1, 0x0F); \
  t1 = _mm_blend_epi16(t0, m3, 0xC0); \
  buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3, 0, 1, 2));

#define LOAD_MSG_3_4(buf) \
  t0 = _mm_unpacklo_epi32(m0, m2); \
  t1 = _mm_unpackhi_epi32(m1, m2); \
  buf = _mm_unpacklo_epi64(t1, t0);

/* ---- LOAD_MSG_4_* ------------------------------------------------------ */

#define LOAD_MSG_4_1(buf) \
  t0 = _mm_unpacklo_epi64(m1, m2); \
  t1 = _mm_unpackhi_epi64(m0, m2); \
  t2 = _mm_blend_epi16(t0, t1, 0x33); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 0, 1, 3));

#define LOAD_MSG_4_2(buf) \
  t0 = _mm_unpackhi_epi64(m1, m3); \
  t1 = _mm_unpacklo_epi64(m0, m1); \
  buf = _mm_blend_epi16(t0, t1, 0x33);

#define LOAD_MSG_4_3(buf) \
  t0 = _mm_unpackhi_epi64(m3, m1); \
  t1 = _mm_unpackhi_epi64(m2, m0); \
  buf = _mm_blend_epi16(t1, t0, 0x33);

#define LOAD_MSG_4_4(buf) \
  t0 = _mm_blend_epi16(m0, m2, 0x03); \
  t1 = _mm_slli_si128(t0, 8); \
  t2 = _mm_blend_epi16(t1, m3, 0x0F); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 2, 0, 3));

/* ---- LOAD_MSG_5_* ------------------------------------------------------ */

#define LOAD_MSG_5_1(buf) \
  t0 = _mm_unpackhi_epi32(m0, m1); \
  t1 = _mm_unpacklo_epi32(m0, m2); \
  buf = _mm_unpacklo_epi64(t0, t1);

#define LOAD_MSG_5_2(buf) \
  t0 = _mm_srli_si128(m2, 4); \
  t1 = _mm_blend_epi16(m0, m3, 0x03); \
  buf = _mm_blend_epi16(t1, t0, 0x3C);

#define LOAD_MSG_5_3(buf) \
  t0 = _mm_blend_epi16(m1, m0, 0x0C); \
  t1 = _mm_srli_si128(m3, 4); \
  t2 = _mm_blend_epi16(t0, t1, 0x30); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 2, 3, 0));

#define LOAD_MSG_5_4(buf) \
  t0 = _mm_unpacklo_epi64(m1, m2); \
  t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(0, 2, 0, 1)); \
  buf = _mm_blend_epi16(t0, t1, 0x33);

/* ---- LOAD_MSG_6_* ------------------------------------------------------ */

#define LOAD_MSG_6_1(buf) \
  t0 = _mm_slli_si128(m1, 12); \
  t1 = _mm_blend_epi16(m0, m3, 0x33); \
  buf = _mm_blend_epi16(t1, t0, 0xC0);

#define LOAD_MSG_6_2(buf) \
  t0 = _mm_blend_epi16(m3, m2, 0x30); \
  t1 = _mm_srli_si128(m1, 4); \
  t2 = _mm_blend_epi16(t0, t1, 0x03); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 3, 0));

#define LOAD_MSG_6_3(buf) \
  t0 = _mm_unpacklo_epi64(m0, m2); \
  t1 = _mm_srli_si128(m1, 4); \
  buf = _mm_shuffle_epi32(_mm_blend_epi16(t0, t1, 0x0C), _MM_SHUFFLE(2, 3, 1, 0));

#define LOAD_MSG_6_4(buf) \
  t0 = _mm_unpackhi_epi32(m1, m2); \
  t1 = _mm_unpackhi_epi64(m0, t0); \
  buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3, 0, 1, 2));

/* ---- LOAD_MSG_7_* ------------------------------------------------------ */

#define LOAD_MSG_7_1(buf) \
  t0 = _mm_unpackhi_epi32(m0, m1); \
  t1 = _mm_blend_epi16(t0, m3, 0x0F); \
  buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(2, 0, 3, 1));

#define LOAD_MSG_7_2(buf) \
  t0 = _mm_blend_epi16(m2, m3, 0x30); \
  t1 = _mm_srli_si128(m0, 4); \
  t2 = _mm_blend_epi16(t0, t1, 0x03); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 0, 2, 3));

#define LOAD_MSG_7_3(buf) \
  t0 = _mm_unpackhi_epi64(m0, m3); \
  t1 = _mm_unpacklo_epi64(m1, m2); \
  t2 = _mm_blend_epi16(t0, t1, 0x3C); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0, 2, 3, 1));

#define LOAD_MSG_7_4(buf) \
  t0 = _mm_unpacklo_epi32(m0, m1); \
  t1 = _mm_unpackhi_epi32(m1, m2); \
  buf = _mm_unpacklo_epi64(t0, t1);

/* ---- LOAD_MSG_8_* ------------------------------------------------------ */

/* Unlike its siblings, this one finishes with _mm_shufflehi_epi16. */
#define LOAD_MSG_8_1(buf) \
  t0 = _mm_unpackhi_epi32(m1, m3); \
  t1 = _mm_unpacklo_epi64(t0, m0); \
  t2 = _mm_blend_epi16(t1, m2, 0xC0); \
  buf = _mm_shufflehi_epi16(t2, _MM_SHUFFLE(1, 0, 3, 2));

#define LOAD_MSG_8_2(buf) \
  t0 = _mm_unpackhi_epi32(m0, m3); \
  t1 = _mm_blend_epi16(m2, t0, 0xF0); \
  buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(0, 2, 1, 3));

#define LOAD_MSG_8_3(buf) \
  t0 = _mm_blend_epi16(m2, m0, 0x0C); \
  t1 = _mm_slli_si128(t0, 4); \
  buf = _mm_blend_epi16(t1, m3, 0x0F);

#define LOAD_MSG_8_4(buf) \
  t0 = _mm_blend_epi16(m1, m0, 0x30); \
  buf = _mm_shuffle_epi32(t0, _MM_SHUFFLE(1, 0, 3, 2));

/* ---- LOAD_MSG_9_* ------------------------------------------------------ */

#define LOAD_MSG_9_1(buf) \
  t0 = _mm_blend_epi16(m0, m2, 0x03); \
  t1 = _mm_blend_epi16(m1, m2, 0x30); \
  t2 = _mm_blend_epi16(t1, t0, 0x0F); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 3, 0, 2));

#define LOAD_MSG_9_2(buf) \
  t0 = _mm_slli_si128(m0, 4); \
  t1 = _mm_blend_epi16(m1, t0, 0xC0); \
  buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(1, 2, 0, 3));

#define LOAD_MSG_9_3(buf) \
  t0 = _mm_unpackhi_epi32(m0, m3); \
  t1 = _mm_unpacklo_epi32(m2, m3); \
  t2 = _mm_unpackhi_epi64(t0, t1); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 0, 2, 1));

#define LOAD_MSG_9_4(buf) \
  t0 = _mm_blend_epi16(m3, m2, 0xC0); \
  t1 = _mm_unpacklo_epi32(m0, m3); \
  t2 = _mm_blend_epi16(t0, t1, 0x0F); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0, 1, 2, 3));

#endif /* BLAKE2S_LOAD_SSE41_H */