// The input consists of six character sets in the Base64 alphabet, which we
// need to map back to the 6-bit values they represent. There are three ranges,
// two singles, and then there's the rest.
//
//  #  From       To        Add  Characters
//  1  [43]       [62]      +19  +
//  2  [47]       [63]      +16  /
//  3  [48..57]   [52..61]   +4  0..9
//  4  [65..90]   [0..25]   -65  A..Z
//  5  [97..122]  [26..51]  -71  a..z
// (6) Everything else => invalid input
//
// We will use lookup tables for character validation and offset computation.
// Remember that 0x2X and 0x0X are the same index for _mm_shuffle_epi8, which
// allows masking with 0x2F instead of 0x0F and thus saves one constant
// declaration (register and/or memory access).
//
// For offsets:
// Perfect hash for lut = ((src >> 4) & 0x2F) + ((src == 0x2F) ? 0xFF : 0x00)
// 0000 = garbage
// 0001 = /
// 0010 = +
// 0011 = 0-9
// 0100 = A-Z
// 0101 = A-Z
// 0110 = a-z
// 0111 = a-z
// 1000 >= garbage
//
// For validation, here's the table.
// A character is valid if and only if the AND of the 2 lookups equals 0:
//
// hi \ lo              0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
//      LUT             0x15 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x13 0x1A 0x1B 0x1B 0x1B 0x1A
//
// 0000 0x10 char        NUL  SOH  STX  ETX  EOT  ENQ  ACK  BEL   BS   HT   LF   VT   FF   CR   SO   SI
//           andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
//
// 0001 0x10 char        DLE  DC1  DC2  DC3  DC4  NAK  SYN  ETB  CAN   EM  SUB  ESC   FS   GS   RS   US
//           andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
//
// 0010 0x01 char               !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
//           andlut     0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x00 0x01 0x01 0x01 0x00
//
// 0011 0x02 char          0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
//           andlut     0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x02 0x02 0x02 0x02 0x02 0x02
//
// 0100 0x04 char          @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
//           andlut     0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
//
// 0101 0x08 char          P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
//           andlut     0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
//
// 0110 0x04 char          `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
//           andlut     0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
//
// 0111 0x08 char          p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~  DEL
//           andlut     0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
//
// 1000 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1001 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1010 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1011 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1100 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1101 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1110 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1111 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10

68 static inline int
dec_loop_ssse3_inner(const uint8_t ** s,uint8_t ** o,size_t * rounds)69 dec_loop_ssse3_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
70 {
71 	const __m128i lut_lo = _mm_setr_epi8(
72 		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
73 		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
74 
75 	const __m128i lut_hi = _mm_setr_epi8(
76 		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
77 		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
78 
79 	const __m128i lut_roll = _mm_setr_epi8(
80 		0,  16,  19,   4, -65, -65, -71, -71,
81 		0,   0,   0,   0,   0,   0,   0,   0);
82 
83 	const __m128i mask_2F = _mm_set1_epi8(0x2F);
84 
85 	// Load input:
86 	__m128i str = _mm_loadu_si128((__m128i *) *s);
87 
88 	// Table lookups:
89 	const __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi32(str, 4), mask_2F);
90 	const __m128i lo_nibbles = _mm_and_si128(str, mask_2F);
91 	const __m128i hi         = _mm_shuffle_epi8(lut_hi, hi_nibbles);
92 	const __m128i lo         = _mm_shuffle_epi8(lut_lo, lo_nibbles);
93 
94 	// Check for invalid input: if any "and" values from lo and hi are not
95 	// zero, fall back on bytewise code to do error checking and reporting:
96 	if (_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_and_si128(lo, hi), _mm_setzero_si128())) != 0) {
97 		return 0;
98 	}
99 
100 	const __m128i eq_2F = _mm_cmpeq_epi8(str, mask_2F);
101 	const __m128i roll  = _mm_shuffle_epi8(lut_roll, _mm_add_epi8(eq_2F, hi_nibbles));
102 
103 	// Now simply add the delta values to the input:
104 	str = _mm_add_epi8(str, roll);
105 
106 	// Reshuffle the input to packed 12-byte output format:
107 	str = dec_reshuffle(str);
108 
109 	// Store the output:
110 	_mm_storeu_si128((__m128i *) *o, str);
111 
112 	*s += 16;
113 	*o += 12;
114 	*rounds -= 1;
115 
116 	return 1;
117 }
// Decode as many whole 16-byte groups of Base64 input as possible.
//
// s    - in/out: read cursor, advanced by 16 for every group decoded.
// slen - in/out: input byte count, reduced by 16 for every group decoded.
// o    - in/out: write cursor, advanced by 12 for every group decoded.
// olen - in/out: output byte count, increased by 12 for every group decoded.
//
// Nothing is done when fewer than 24 input bytes are available. Decoding
// stops early at the first group that contains a byte outside the Base64
// alphabet; that group and everything after it are left for the caller.
static inline void
dec_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	if (*slen < 24) {
		return;
	}

	// Process blocks of 16 bytes per round. Because 4 extra zero bytes are
	// written after the output, ensure that there will be at least 8 bytes
	// of input data left to cover the gap. (6 data bytes and up to two
	// end-of-string markers.)
	size_t rounds = (*slen - 8) / 16;

	// Account for all rounds up front; rounds that end up not being
	// executed are credited back after the loop.
	*slen -= rounds * 16;	// 16 bytes consumed per round
	*olen += rounds * 12;	// 12 bytes produced per round

	// Unrolled in steps of 8, 4, 2 and 1 rounds. Each inner call decrements
	// `rounds` on success; the first failing call short-circuits the chain
	// and leaves the loop.
	do {
		if (rounds >= 8) {
			if (dec_loop_ssse3_inner(s, o, &rounds) &&
			    dec_loop_ssse3_inner(s, o, &rounds) &&
			    dec_loop_ssse3_inner(s, o, &rounds) &&
			    dec_loop_ssse3_inner(s, o, &rounds) &&
			    dec_loop_ssse3_inner(s, o, &rounds) &&
			    dec_loop_ssse3_inner(s, o, &rounds) &&
			    dec_loop_ssse3_inner(s, o, &rounds) &&
			    dec_loop_ssse3_inner(s, o, &rounds)) {
				continue;
			}
			break;
		}
		if (rounds >= 4) {
			if (dec_loop_ssse3_inner(s, o, &rounds) &&
			    dec_loop_ssse3_inner(s, o, &rounds) &&
			    dec_loop_ssse3_inner(s, o, &rounds) &&
			    dec_loop_ssse3_inner(s, o, &rounds)) {
				continue;
			}
			break;
		}
		if (rounds >= 2) {
			if (dec_loop_ssse3_inner(s, o, &rounds) &&
			    dec_loop_ssse3_inner(s, o, &rounds)) {
				continue;
			}
			break;
		}
		// Exactly one round left (rounds >= 1 is guaranteed here by the
		// length check above and by the loop condition):
		dec_loop_ssse3_inner(s, o, &rounds);
		break;

	} while (rounds > 0);

	// Adjust for any rounds that were skipped:
	*slen += rounds * 16;
	*olen -= rounds * 12;
}