// The AVX512 algorithm is based on permutexvar and multishift. The code is
// based on https://github.com/WojciechMula/base64simd, which is under the
// BSD-2 license.

4 static inline __m512i
enc_reshuffle_translate(const __m512i input)5 enc_reshuffle_translate (const __m512i input)
6 {
7 // 32-bit input
8 // [ 0 0 0 0 0 0 0 0|c1 c0 d5 d4 d3 d2 d1 d0|
9 // b3 b2 b1 b0 c5 c4 c3 c2|a5 a4 a3 a2 a1 a0 b5 b4]
10 // output order [1, 2, 0, 1]
11 // [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
12 // a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0]
13
14 const __m512i shuffle_input = _mm512_setr_epi32(0x01020001,
15 0x04050304,
16 0x07080607,
17 0x0a0b090a,
18 0x0d0e0c0d,
19 0x10110f10,
20 0x13141213,
21 0x16171516,
22 0x191a1819,
23 0x1c1d1b1c,
24 0x1f201e1f,
25 0x22232122,
26 0x25262425,
27 0x28292728,
28 0x2b2c2a2b,
29 0x2e2f2d2e);
30
31 // Reorder bytes
32 // [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
33 // a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0]
34 const __m512i in = _mm512_permutexvar_epi8(shuffle_input, input);
35
36 // After multishift a single 32-bit lane has following layout
37 // [c1 c0 d5 d4 d3 d2 d1 d0|b1 b0 c5 c4 c3 c2 c1 c0|
38 // a1 a0 b5 b4 b3 b2 b1 b0|d1 d0 a5 a4 a3 a2 a1 a0]
39 // (a = [10:17], b = [4:11], c = [22:27], d = [16:21])
40
41 // 48, 54, 36, 42, 16, 22, 4, 10
42 const __m512i shifts = _mm512_set1_epi64(0x3036242a1016040alu);
43 __m512i shuffled_in = _mm512_multishift_epi64_epi8(shifts, in);
44
45 // Translate immediatedly after reshuffled.
46 const __m512i lookup = _mm512_loadu_si512(base64_table_enc_6bit);
47
48 // Translation 6-bit values to ASCII.
49 return _mm512_permutexvar_epi8(shuffled_in, lookup);
50 }
51