static inline int
dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
{
	const __m256i lut_lo = _mm256_setr_epi8(
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);

	const __m256i lut_hi = _mm256_setr_epi8(
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);

	const __m256i lut_roll = _mm256_setr_epi8(
		0,  16,  19,   4, -65, -65, -71, -71,
		0,   0,   0,   0,   0,   0,   0,   0,
		0,  16,  19,   4, -65, -65, -71, -71,
		0,   0,   0,   0,   0,   0,   0,   0);

	const __m256i mask_2F = _mm256_set1_epi8(0x2F);

	// Load input:
	__m256i str = _mm256_loadu_si256((__m256i *) *s);

	// See the SSSE3 decoder for an explanation of the algorithm.
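	// In brief: lut_hi and lut_lo assign a flag byte to every high and low
	// nibble value, chosen so that the bitwise AND of the two lookups is
	// zero exactly for bytes in the base64 alphabet ('A'..'Z', 'a'..'z',
	// '0'..'9', '+', '/') and nonzero otherwise. _mm256_testz_si256() below
	// returns 1 only when lo & hi is all-zero, i.e. when all 32 input bytes
	// are valid. (mask_2F doubles as the nibble mask; the extra bit it lets
	// through is ignored by the byte shuffle, which uses only the low four
	// index bits.)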
	const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
	const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
	const __m256i hi         = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
	const __m256i lo         = _mm256_shuffle_epi8(lut_lo, lo_nibbles);

	if (!_mm256_testz_si256(lo, hi)) {
		return 0;
	}

	const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
	const __m256i roll  = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));
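	// lut_roll holds, per character class, the offset that turns an ASCII
	// code into its 6-bit value: +4 for '0'..'9', -65 for 'A'..'Z', -71 for
	// 'a'..'z', +19 for '+' and +16 for '/'. The comparison against '/'
	// (0x2F) lowers that byte's table index by one, which is how '+' and
	// '/' receive different offsets despite sharing a high nibble.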

	// Now simply add the delta values to the input:
	str = _mm256_add_epi8(str, roll);

	// Reshuffle to the packed output format (12 bytes per 128-bit lane, 24 bytes in total):
	str = dec_reshuffle(str);

	// Store the output:
	_mm256_storeu_si256((__m256i *) *o, str);

	*s += 32;
	*o += 24;
	*rounds -= 1;

	return 1;
}
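
// The dec_reshuffle() helper used above is defined elsewhere in the library.
// The function below is only an illustrative sketch (an assumption, not the
// library's actual implementation) of how such a repacking step can be done:
// each 32-bit group holds four 6-bit values, one per byte, and these are
// merged into three output bytes.
static inline __m256i
dec_reshuffle_sketch (const __m256i in)
{
	// Merge byte pairs: each 16-bit word becomes (c0 << 6) | c1, where c0
	// is the more significant 6-bit value of the pair.
	const __m256i merged = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140));

	// Merge word pairs: each 32-bit dword becomes the full 24-bit group
	// (c0 << 18) | (c1 << 12) | (c2 << 6) | c3.
	__m256i out = _mm256_madd_epi16(merged, _mm256_set1_epi32(0x00011000));

	// Within each 128-bit lane, pick the three payload bytes of every
	// dword in output order; the -1 indices zero the last four bytes of
	// each lane.
	out = _mm256_shuffle_epi8(out, _mm256_setr_epi8(
		 2,  1,  0,  6,  5,  4, 10,  9,  8, 14, 13, 12, -1, -1, -1, -1,
		 2,  1,  0,  6,  5,  4, 10,  9,  8, 14, 13, 12, -1, -1, -1, -1));

	// Gather the 2 x 12 payload bytes into the low 24 bytes; dwords 3 and
	// 7 are zero, so the tail of a 32-byte store consists of zero bytes,
	// matching the "8 extra zero bytes" accounted for in dec_loop_avx2.
	return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, 3, 7));
}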

static inline void
dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
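	// 45 = 32 (one block) + 13 (the safety margin explained below); with
	// fewer input bytes this loop cannot run even once.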
	if (*slen < 45) {
		return;
	}

	// Process blocks of 32 bytes per round. Because 8 extra zero bytes are
	// written after the output, ensure that there will be at least 13
	// bytes of input data left to cover the gap. (11 data bytes and up to
	// two end-of-string markers.)
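	// For example, with *slen == 77: rounds = (77 - 13) / 32 = 2, so the
	// loop consumes 64 bytes and leaves 13 for the scalar tail.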
	size_t rounds = (*slen - 13) / 32;

	*slen -= rounds * 32;	// 32 bytes consumed per round
	*olen += rounds * 24;	// 24 bytes produced per round

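	// The && chains below form a manually unrolled loop: the first 32-byte
	// block containing an invalid character makes dec_loop_avx2_inner()
	// return 0 before it updates the pointers or the round count, the
	// chain short-circuits, and the leftover rounds are handed back to the
	// caller by the adjustment at the end of this function.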
	do {
		if (rounds >= 8) {
			if (dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds)) {
				continue;
			}
			break;
		}
		if (rounds >= 4) {
			if (dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds)) {
				continue;
			}
			break;
		}
		if (rounds >= 2) {
			if (dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds)) {
				continue;
			}
			break;
		}
		dec_loop_avx2_inner(s, o, &rounds);
		break;

	} while (rounds > 0);

	// Adjust for any rounds that were skipped:
	*slen += rounds * 32;
	*olen -= rounds * 24;
}
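
// Usage sketch. The function below is not part of this file; it is an
// illustrative assumption showing how a caller might drive dec_loop_avx2():
// the loop decodes the bulk of the input, and whatever it leaves behind
// (*slen bytes at *s) must then be handled by a scalar tail decoder, which
// is omitted here.
static inline size_t
dec_loop_avx2_usage_sketch (const uint8_t *src, size_t srclen, uint8_t *dst)
{
	const uint8_t *s = src;
	uint8_t *o = dst;
	size_t slen = srclen;
	size_t olen = 0;

	// Decode as many whole 32-byte blocks as the length check allows:
	dec_loop_avx2(&s, &slen, &o, &olen);

	// `slen` bytes starting at `s` remain undecoded; a scalar decoder
	// would process them here and add its own output count to `olen`.
	return olen;
}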