1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/core/lib/strings/ordered_code.h"
17
18 #include <assert.h>
19 #include <stddef.h>
20
21 #include "tensorflow/core/lib/core/stringpiece.h"
22 #include "tensorflow/core/platform/logging.h"
23
24 namespace tensorflow {
25 namespace strings {
26
27 // We encode a string in different ways depending on whether the item
28 // should be in lexicographically increasing or decreasing order.
29 //
30 //
31 // Lexicographically increasing order
32 //
33 // We want a string-to-string mapping F(x) such that for any two strings
34 //
35 // x < y => F(x) < F(y)
36 //
37 // In addition to the normal characters '\x00' through '\xff', we want to
38 // encode a few extra symbols in strings:
39 //
40 // <sep> Separator between items
41 // <infinity> Infinite string
42 //
43 // Therefore we need an alphabet with at least 258 symbols. Each
44 // character '\1' through '\xfe' is mapped to itself. The other four are
45 // encoded into two-letter sequences starting with '\0' and '\xff':
46 //
47 // <sep> encoded as => \0\1
48 // \0 encoded as => \0\xff
49 // \xff encoded as => \xff\x00
50 // <infinity> encoded as => \xff\xff
51 //
52 // The remaining two-letter sequences starting with '\0' and '\xff' are
53 // currently unused.
54 //
55 // F(<infinity>) is defined above. For any finite string x, F(x) is the
56 // the encodings of x's characters followed by the encoding for <sep>. The
57 // ordering of two finite strings is the same as the ordering of the
58 // respective characters at the first position where they differ, which in
59 // turn is the same as the ordering of the encodings of those two
60 // characters. Moreover, for every finite string x, F(x) < F(<infinity>).
61 //
62 //
63 // Lexicographically decreasing order
64 //
65 // We want a string-to-string mapping G(x) such that for any two strings,
66 // whether finite or not,
67 //
68 // x < y => G(x) > G(y)
69 //
70 // To achieve this, define G(x) to be the inversion of F(x): I(F(x)). In
71 // other words, invert every bit in F(x) to get G(x). For example,
72 //
73 // x = \x00\x13\xff
74 // F(x) = \x00\xff\x13\xff\x00\x00\x01 escape \0, \xff, append F(<sep>)
75 // G(x) = \xff\x00\xec\x00\xff\xff\xfe invert every bit in F(x)
76 //
77 // x = <infinity>
78 // F(x) = \xff\xff
79 // G(x) = \x00\x00
80 //
81 // Another example is
82 //
83 // x F(x) G(x) = I(F(x))
84 // - ---- --------------
85 // <infinity> \xff\xff \x00\x00
86 // "foo" foo\0\1 \x99\x90\x90\xff\xfe
87 // "aaa" aaa\0\1 \x9e\x9e\x9e\xff\xfe
88 // "aa" aa\0\1 \x9e\x9e\xff\xfe
89 // "" \0\1 \xff\xfe
90 //
91 // More generally and rigorously, if for any two strings x and y
92 //
93 // F(x) < F(y) => I(F(x)) > I(F(y)) (1)
94 //
95 // it would follow that x < y => G(x) > G(y) because
96 //
97 // x < y => F(x) < F(y) => G(x) = I(F(x)) > I(F(y)) = G(y)
98 //
99 // We now show why (1) is true, in two parts. Notice that for any two
100 // strings x < y, F(x) is *not* a proper prefix of F(y). Suppose x is a
101 // proper prefix of y (say, x="abc" < y="abcd"). F(x) and F(y) diverge at
102 // the F(<sep>) in F(x) (v. F('d') in the example). Suppose x is not a
103 // proper prefix of y (say, x="abce" < y="abd"), F(x) and F(y) diverge at
104 // their respective encodings of the characters where x and y diverge
105 // (F('c') v. F('d')). Finally, if y=<infinity>, we can see that
106 // F(y)=\xff\xff is not the prefix of F(x) for any finite string x, simply
107 // by considering all the possible first characters of F(x).
108 //
109 // Given that F(x) is not a proper prefix F(y), the order of F(x) and F(y)
110 // is determined by the byte where F(x) and F(y) diverge. For example, the
111 // order of F(x)="eefh" and F(y)="eeg" is determined by their third
112 // characters. I(p) inverts each byte in p, which effectively subtracts
113 // each byte from 0xff. So, in this example, I('f') > I('g'), and thus
114 // I(F(x)) > I(F(y)).
115 //
116 //
117 // Implementation
118 //
119 // To implement G(x) efficiently, we use C++ template to instantiate two
120 // versions of the code to produce F(x), one for normal encoding (giving us
121 // F(x)) and one for inverted encoding (giving us G(x) = I(F(x))).
122
123 static const char kEscape1 = '\000';
124 static const char kNullCharacter = '\xff'; // Combined with kEscape1
125 static const char kSeparator = '\001'; // Combined with kEscape1
126
127 static const char kEscape2 = '\xff';
128 static const char kFFCharacter = '\000'; // Combined with kEscape2
129
130 static const char kEscape1_Separator[2] = {kEscape1, kSeparator};
131
132 // Append to "*dest" the "len" bytes starting from "*src".
AppendBytes(string * dest,const char * src,size_t len)133 inline static void AppendBytes(string* dest, const char* src, size_t len) {
134 dest->append(src, len);
135 }
136
IsSpecialByte(char c)137 inline bool IsSpecialByte(char c) {
138 return (static_cast<unsigned char>(c + 1)) < 2;
139 }
140
141 // Return a pointer to the first byte in the range "[start..limit)"
142 // whose value is 0 or 255 (kEscape1 or kEscape2). If no such byte
143 // exists in the range, returns "limit".
SkipToNextSpecialByte(const char * start,const char * limit)144 inline const char* SkipToNextSpecialByte(const char* start, const char* limit) {
145 // If these constants were ever changed, this routine needs to change
146 DCHECK_EQ(kEscape1, 0);
147 DCHECK_EQ(kEscape2 & 0xffu, 255u);
148 const char* p = start;
149 while (p < limit && !IsSpecialByte(*p)) {
150 p++;
151 }
152 return p;
153 }
154
155 // Expose SkipToNextSpecialByte for testing purposes
TEST_SkipToNextSpecialByte(const char * start,const char * limit)156 const char* OrderedCode::TEST_SkipToNextSpecialByte(const char* start,
157 const char* limit) {
158 return SkipToNextSpecialByte(start, limit);
159 }
160
161 // Helper routine to encode "s" and append to "*dest", escaping special
162 // characters.
EncodeStringFragment(string * dest,StringPiece s)163 inline static void EncodeStringFragment(string* dest, StringPiece s) {
164 const char* p = s.data();
165 const char* limit = p + s.size();
166 const char* copy_start = p;
167 while (true) {
168 p = SkipToNextSpecialByte(p, limit);
169 if (p >= limit) break; // No more special characters that need escaping
170 char c = *(p++);
171 DCHECK(IsSpecialByte(c));
172 if (c == kEscape1) {
173 AppendBytes(dest, copy_start, p - copy_start - 1);
174 dest->push_back(kEscape1);
175 dest->push_back(kNullCharacter);
176 copy_start = p;
177 } else {
178 assert(c == kEscape2);
179 AppendBytes(dest, copy_start, p - copy_start - 1);
180 dest->push_back(kEscape2);
181 dest->push_back(kFFCharacter);
182 copy_start = p;
183 }
184 }
185 if (p > copy_start) {
186 AppendBytes(dest, copy_start, p - copy_start);
187 }
188 }
189
WriteString(string * dest,StringPiece s)190 void OrderedCode::WriteString(string* dest, StringPiece s) {
191 EncodeStringFragment(dest, s);
192 AppendBytes(dest, kEscape1_Separator, 2);
193 }
194
WriteNumIncreasing(string * dest,uint64 val)195 void OrderedCode::WriteNumIncreasing(string* dest, uint64 val) {
196 // Values are encoded with a single byte length prefix, followed
197 // by the actual value in big-endian format with leading 0 bytes
198 // dropped.
199 unsigned char buf[9]; // 8 bytes for value plus one byte for length
200 int len = 0;
201 while (val > 0) {
202 len++;
203 buf[9 - len] = (val & 0xff);
204 val >>= 8;
205 }
206 buf[9 - len - 1] = len;
207 len++;
208 AppendBytes(dest, reinterpret_cast<const char*>(buf + 9 - len), len);
209 }
210
211 // Parse the encoding of a previously encoded string.
212 // If parse succeeds, return true, consume encoding from
213 // "*src", and if result != NULL append the decoded string to "*result".
214 // Otherwise, return false and leave both undefined.
ReadStringInternal(StringPiece * src,string * result)215 inline static bool ReadStringInternal(StringPiece* src, string* result) {
216 const char* start = src->data();
217 const char* string_limit = src->data() + src->size();
218
219 // We only scan up to "limit-2" since a valid string must end with
220 // a two character terminator: 'kEscape1 kSeparator'
221 const char* limit = string_limit - 1;
222 const char* copy_start = start;
223 while (true) {
224 start = SkipToNextSpecialByte(start, limit);
225 if (start >= limit) break; // No terminator sequence found
226 const char c = *(start++);
227 // If inversion is required, instead of inverting 'c', we invert the
228 // character constants to which 'c' is compared. We get the same
229 // behavior but save the runtime cost of inverting 'c'.
230 DCHECK(IsSpecialByte(c));
231 if (c == kEscape1) {
232 if (result) {
233 AppendBytes(result, copy_start, start - copy_start - 1);
234 }
235 // kEscape1 kSeparator ends component
236 // kEscape1 kNullCharacter represents '\0'
237 const char next = *(start++);
238 if (next == kSeparator) {
239 src->remove_prefix(start - src->data());
240 return true;
241 } else if (next == kNullCharacter) {
242 if (result) {
243 *result += '\0';
244 }
245 } else {
246 return false;
247 }
248 copy_start = start;
249 } else {
250 assert(c == kEscape2);
251 if (result) {
252 AppendBytes(result, copy_start, start - copy_start - 1);
253 }
254 // kEscape2 kFFCharacter represents '\xff'
255 // kEscape2 kInfinity is an error
256 const char next = *(start++);
257 if (next == kFFCharacter) {
258 if (result) {
259 *result += '\xff';
260 }
261 } else {
262 return false;
263 }
264 copy_start = start;
265 }
266 }
267 return false;
268 }
269
ReadString(StringPiece * src,string * result)270 bool OrderedCode::ReadString(StringPiece* src, string* result) {
271 return ReadStringInternal(src, result);
272 }
273
ReadNumIncreasing(StringPiece * src,uint64 * result)274 bool OrderedCode::ReadNumIncreasing(StringPiece* src, uint64* result) {
275 if (src->empty()) {
276 return false; // Not enough bytes
277 }
278
279 // Decode length byte
280 const size_t len = static_cast<unsigned char>((*src)[0]);
281
282 // If len > 0 and src is longer than 1, the first byte of "payload"
283 // must be non-zero (otherwise the encoding is not minimal).
284 // In opt mode, we don't enforce that encodings must be minimal.
285 DCHECK(0 == len || src->size() == 1 || (*src)[1] != '\0')
286 << "invalid encoding";
287
288 if (len + 1 > src->size() || len > 8) {
289 return false; // Not enough bytes or too many bytes
290 }
291
292 if (result) {
293 uint64 tmp = 0;
294 for (size_t i = 0; i < len; i++) {
295 tmp <<= 8;
296 tmp |= static_cast<unsigned char>((*src)[1 + i]);
297 }
298 *result = tmp;
299 }
300 src->remove_prefix(len + 1);
301 return true;
302 }
303
TEST_Corrupt(string * str,int k)304 void OrderedCode::TEST_Corrupt(string* str, int k) {
305 int seen_seps = 0;
306 for (size_t i = 0; i + 1 < str->size(); i++) {
307 if ((*str)[i] == kEscape1 && (*str)[i + 1] == kSeparator) {
308 seen_seps++;
309 if (seen_seps == k) {
310 (*str)[i + 1] = kSeparator + 1;
311 return;
312 }
313 }
314 }
315 }
316
317 // Signed number encoding/decoding /////////////////////////////////////
318 //
319 // The format is as follows:
320 //
321 // The first bit (the most significant bit of the first byte)
322 // represents the sign, 0 if the number is negative and
323 // 1 if the number is >= 0.
324 //
325 // Any unbroken sequence of successive bits with the same value as the sign
326 // bit, up to 9 (the 8th and 9th are the most significant bits of the next
327 // byte), are size bits that count the number of bytes after the first byte.
328 // That is, the total length is between 1 and 10 bytes.
329 //
330 // The value occupies the bits after the sign bit and the "size bits"
331 // till the end of the string, in network byte order. If the number
332 // is negative, the bits are in 2-complement.
333 //
334 //
335 // Example 1: number 0x424242 -> 4 byte big-endian hex string 0xf0424242:
336 //
337 // +---------------+---------------+---------------+---------------+
338 // 1 1 1 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0
339 // +---------------+---------------+---------------+---------------+
340 // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
341 // | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
342 // | | | | payload: the remaining bits after the sign and size bits
343 // | | | | and the delimiter bit, the value is 0x424242
344 // | | | |
345 // | size bits: 3 successive bits with the same value as the sign bit
346 // | (followed by a delimiter bit with the opposite value)
347 // | mean that there are 3 bytes after the first byte, 4 total
348 // |
349 // sign bit: 1 means that the number is non-negative
350 //
351 // Example 2: negative number -0x800 -> 2 byte big-endian hex string 0x3800:
352 //
353 // +---------------+---------------+
354 // 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0
355 // +---------------+---------------+
356 // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
357 // | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
358 // | | payload: the remaining bits after the sign and size bits and the
359 // | | delimiter bit, 2-complement because of the negative sign,
360 // | | value is ~0x7ff, represents the value -0x800
361 // | |
362 // | size bits: 1 bit with the same value as the sign bit
363 // | (followed by a delimiter bit with the opposite value)
364 // | means that there is 1 byte after the first byte, 2 total
365 // |
366 // sign bit: 0 means that the number is negative
367 //
368 //
369 // Compared with the simpler unsigned format used for uint64 numbers,
370 // this format is more compact for small numbers, namely one byte encodes
371 // numbers in the range [-64,64), two bytes cover the range [-2^13,2^13), etc.
372 // In general, n bytes encode numbers in the range [-2^(n*7-1),2^(n*7-1)).
373 // (The cross-over point for compactness of representation is 8 bytes,
374 // where this format only covers the range [-2^55,2^55),
375 // whereas an encoding with sign bit and length in the first byte and
376 // payload in all following bytes would cover [-2^56,2^56).)
377
378 static const int kMaxSigned64Length = 10;
379
380 // This array maps encoding length to header bits in the first two bytes.
381 static const char kLengthToHeaderBits[1 + kMaxSigned64Length][2] = {
382 {0, 0}, {'\x80', 0}, {'\xc0', 0}, {'\xe0', 0},
383 {'\xf0', 0}, {'\xf8', 0}, {'\xfc', 0}, {'\xfe', 0},
384 {'\xff', 0}, {'\xff', '\x80'}, {'\xff', '\xc0'}};
385
386 // This array maps encoding lengths to the header bits that overlap with
387 // the payload and need fixing when reading.
388 static const uint64 kLengthToMask[1 + kMaxSigned64Length] = {
389 0ULL,
390 0x80ULL,
391 0xc000ULL,
392 0xe00000ULL,
393 0xf0000000ULL,
394 0xf800000000ULL,
395 0xfc0000000000ULL,
396 0xfe000000000000ULL,
397 0xff00000000000000ULL,
398 0x8000000000000000ULL,
399 0ULL};
400
401 // This array maps the number of bits in a number to the encoding
402 // length produced by WriteSignedNumIncreasing.
403 // For positive numbers, the number of bits is 1 plus the most significant
404 // bit position (the highest bit position in a positive int64 is 63).
405 // For a negative number n, we count the bits in ~n.
406 // That is, length = kBitsToLength[Bits::Log2Floor64(n < 0 ? ~n : n) + 1].
407 static const int8 kBitsToLength[1 + 63] = {
408 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4,
409 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7,
410 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 10};
411
412 #if defined(__GNUC__)
413 // Returns floor(lg(n)). Returns -1 if n == 0.
Log2Floor64(uint64 n)414 static int Log2Floor64(uint64 n) {
415 return n == 0 ? -1 : 63 ^ __builtin_clzll(n);
416 }
417 #else
418 // Portable slow version
Log2Floor32_Portable(uint32 n)419 static int Log2Floor32_Portable(uint32 n) {
420 if (n == 0) return -1;
421 int log = 0;
422 uint32 value = n;
423 for (int i = 4; i >= 0; --i) {
424 int shift = (1 << i);
425 uint32 x = value >> shift;
426 if (x != 0) {
427 value = x;
428 log += shift;
429 }
430 }
431 assert(value == 1);
432 return log;
433 }
434 // Returns floor(lg(n)). Returns -1 if n == 0.
Log2Floor64(uint64 n)435 static int Log2Floor64(uint64 n) {
436 const uint32 topbits = static_cast<uint32>(n >> 32);
437 if (topbits == 0) {
438 // Top bits are zero, so scan in bottom bits
439 return Log2Floor32_Portable(static_cast<uint32>(n));
440 } else {
441 return 32 + Log2Floor32_Portable(topbits);
442 }
443 }
444 #endif
445
446 // Calculates the encoding length in bytes of the signed number n.
SignedEncodingLength(int64 n)447 static inline int SignedEncodingLength(int64 n) {
448 return kBitsToLength[Log2Floor64(n < 0 ? ~n : n) + 1];
449 }
450
StoreBigEndian64(char * dst,uint64 v)451 static void StoreBigEndian64(char* dst, uint64 v) {
452 for (int i = 0; i < 8; i++) {
453 dst[i] = (v >> (56 - 8 * i)) & 0xff;
454 }
455 }
456
LoadBigEndian64(const char * src)457 static uint64 LoadBigEndian64(const char* src) {
458 uint64 result = 0;
459 for (int i = 0; i < 8; i++) {
460 unsigned char c = static_cast<unsigned char>(src[i]);
461 result |= static_cast<uint64>(c) << (56 - 8 * i);
462 }
463 return result;
464 }
465
WriteSignedNumIncreasing(string * dest,int64 val)466 void OrderedCode::WriteSignedNumIncreasing(string* dest, int64 val) {
467 const uint64 x = val < 0 ? ~val : val;
468 if (x < 64) { // fast path for encoding length == 1
469 *dest += kLengthToHeaderBits[1][0] ^ val;
470 return;
471 }
472 // buf = val in network byte order, sign extended to 10 bytes
473 const char sign_byte = val < 0 ? '\xff' : '\0';
474 char buf[10] = {
475 sign_byte,
476 sign_byte,
477 };
478 StoreBigEndian64(buf + 2, val);
479 static_assert(sizeof(buf) == kMaxSigned64Length, "max length size mismatch");
480 const int len = SignedEncodingLength(x);
481 DCHECK_GE(len, 2);
482 char* const begin = buf + sizeof(buf) - len;
483 begin[0] ^= kLengthToHeaderBits[len][0];
484 begin[1] ^= kLengthToHeaderBits[len][1]; // ok because len >= 2
485 dest->append(begin, len);
486 }
487
ReadSignedNumIncreasing(StringPiece * src,int64 * result)488 bool OrderedCode::ReadSignedNumIncreasing(StringPiece* src, int64* result) {
489 if (src->empty()) return false;
490 const uint64 xor_mask = (!((*src)[0] & 0x80)) ? ~0ULL : 0ULL;
491 const unsigned char first_byte = (*src)[0] ^ (xor_mask & 0xff);
492
493 // now calculate and test length, and set x to raw (unmasked) result
494 int len;
495 uint64 x;
496 if (first_byte != 0xff) {
497 len = 7 - Log2Floor64(first_byte ^ 0xff);
498 if (src->size() < static_cast<size_t>(len)) return false;
499 x = xor_mask; // sign extend using xor_mask
500 for (int i = 0; i < len; ++i)
501 x = (x << 8) | static_cast<unsigned char>((*src)[i]);
502 } else {
503 len = 8;
504 if (src->size() < static_cast<size_t>(len)) return false;
505 const unsigned char second_byte = (*src)[1] ^ (xor_mask & 0xff);
506 if (second_byte >= 0x80) {
507 if (second_byte < 0xc0) {
508 len = 9;
509 } else {
510 const unsigned char third_byte = (*src)[2] ^ (xor_mask & 0xff);
511 if (second_byte == 0xc0 && third_byte < 0x80) {
512 len = 10;
513 } else {
514 return false; // either len > 10 or len == 10 and #bits > 63
515 }
516 }
517 if (src->size() < static_cast<size_t>(len)) return false;
518 }
519 x = LoadBigEndian64(src->data() + len - 8);
520 }
521
522 x ^= kLengthToMask[len]; // remove spurious header bits
523
524 DCHECK_EQ(len, SignedEncodingLength(x)) << "invalid encoding";
525
526 if (result) *result = x;
527 src->remove_prefix(len);
528 return true;
529 }
530
531 } // namespace strings
532 } // namespace tensorflow
533