// Copyright (c) 2012 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "sync/internal_api/public/base/unique_position.h" #include "base/basictypes.h" #include "base/logging.h" #include "base/rand_util.h" #include "base/stl_util.h" #include "base/strings/string_number_conversions.h" #include "sync/protocol/unique_position.pb.h" #include "third_party/zlib/zlib.h" namespace syncer { const size_t UniquePosition::kSuffixLength = 28; const size_t UniquePosition::kCompressBytesThreshold = 128; // static. bool UniquePosition::IsValidSuffix(const std::string& suffix) { // The suffix must be exactly the specified length, otherwise unique suffixes // are not sufficient to guarantee unique positions (because prefix + suffix // == p + refixsuffix). return suffix.length() == kSuffixLength && suffix[kSuffixLength-1] != 0; } // static. bool UniquePosition::IsValidBytes(const std::string& bytes) { // The first condition ensures that our suffix uniqueness is sufficient to // guarantee position uniqueness. Otherwise, it's possible the end of some // prefix + some short suffix == some long suffix. // The second condition ensures that FindSmallerWithSuffix can always return a // result. return bytes.length() >= kSuffixLength && bytes[bytes.length()-1] != 0; } // static. std::string UniquePosition::RandomSuffix() { // Users random data for all but the last byte. The last byte must not be // zero. We arbitrarily set it to 0x7f. return base::RandBytesAsString(kSuffixLength - 1) + "\x7f"; } // static. UniquePosition UniquePosition::CreateInvalid() { UniquePosition pos; DCHECK(!pos.IsValid()); return pos; } // static. UniquePosition UniquePosition::FromProto(const sync_pb::UniquePosition& proto) { if (proto.has_custom_compressed_v1()) { return UniquePosition(proto.custom_compressed_v1()); } else if (proto.has_value() && !proto.value().empty()) { return UniquePosition(Compress(proto.value())); } else if (proto.has_compressed_value() && proto.has_uncompressed_length()) { uLongf uncompressed_len = proto.uncompressed_length(); std::string un_gzipped; un_gzipped.resize(uncompressed_len); int result = uncompress( reinterpret_cast(string_as_array(&un_gzipped)), &uncompressed_len, reinterpret_cast(proto.compressed_value().data()), proto.compressed_value().size()); if (result != Z_OK) { DLOG(ERROR) << "Unzip failed " << result; return UniquePosition::CreateInvalid(); } if (uncompressed_len != proto.uncompressed_length()) { DLOG(ERROR) << "Uncompressed length " << uncompressed_len << " did not match specified length " << proto.uncompressed_length(); return UniquePosition::CreateInvalid(); } return UniquePosition(Compress(un_gzipped)); } else { return UniquePosition::CreateInvalid(); } } // static. UniquePosition UniquePosition::FromInt64( int64 x, const std::string& suffix) { uint64 y = static_cast(x); y ^= 0x8000000000000000ULL; // Make it non-negative. std::string bytes(8, 0); for (int i = 7; i >= 0; --i) { bytes[i] = static_cast(y); y >>= 8; } return UniquePosition(bytes + suffix, suffix); } // static. UniquePosition UniquePosition::InitialPosition( const std::string& suffix) { DCHECK(IsValidSuffix(suffix)); return UniquePosition(suffix, suffix); } // static. UniquePosition UniquePosition::Before( const UniquePosition& x, const std::string& suffix) { DCHECK(IsValidSuffix(suffix)); DCHECK(x.IsValid()); const std::string& before = FindSmallerWithSuffix( Uncompress(x.compressed_), suffix); return UniquePosition(before + suffix, suffix); } // static. UniquePosition UniquePosition::After( const UniquePosition& x, const std::string& suffix) { DCHECK(IsValidSuffix(suffix)); DCHECK(x.IsValid()); const std::string& after = FindGreaterWithSuffix( Uncompress(x.compressed_), suffix); return UniquePosition(after + suffix, suffix); } // static. UniquePosition UniquePosition::Between( const UniquePosition& before, const UniquePosition& after, const std::string& suffix) { DCHECK(before.IsValid()); DCHECK(after.IsValid()); DCHECK(before.LessThan(after)); DCHECK(IsValidSuffix(suffix)); const std::string& mid = FindBetweenWithSuffix( Uncompress(before.compressed_), Uncompress(after.compressed_), suffix); return UniquePosition(mid + suffix, suffix); } UniquePosition::UniquePosition() : is_valid_(false) {} bool UniquePosition::LessThan(const UniquePosition& other) const { DCHECK(this->IsValid()); DCHECK(other.IsValid()); return compressed_ < other.compressed_; } bool UniquePosition::Equals(const UniquePosition& other) const { if (!this->IsValid() && !other.IsValid()) return true; return compressed_ == other.compressed_; } void UniquePosition::ToProto(sync_pb::UniquePosition* proto) const { proto->Clear(); // This is the current preferred foramt. proto->set_custom_compressed_v1(compressed_); // Older clients used to write other formats. We don't bother doing that // anymore because that form of backwards compatibility is expensive. We no // longer want to pay that price just too support clients that have been // obsolete for a long time. See the proto definition for details. } void UniquePosition::SerializeToString(std::string* blob) const { DCHECK(blob); sync_pb::UniquePosition proto; ToProto(&proto); proto.SerializeToString(blob); } int64 UniquePosition::ToInt64() const { uint64 y = 0; const std::string& s = Uncompress(compressed_); size_t l = sizeof(int64); if (s.length() < l) { NOTREACHED(); l = s.length(); } for (size_t i = 0; i < l; ++i) { const uint8 byte = s[l - i - 1]; y |= static_cast(byte) << (i * 8); } y ^= 0x8000000000000000ULL; // This is technically implementation-defined if y > INT64_MAX, so // we're assuming that we're on a twos-complement machine. return static_cast(y); } bool UniquePosition::IsValid() const { return is_valid_; } std::string UniquePosition::ToDebugString() const { const std::string bytes = Uncompress(compressed_); if (bytes.empty()) return std::string("INVALID[]"); std::string debug_string = base::HexEncode(bytes.data(), bytes.length()); if (!IsValid()) { debug_string = "INVALID[" + debug_string + "]"; } std::string compressed_string = base::HexEncode(compressed_.data(), compressed_.length()); debug_string.append(", compressed: " + compressed_string); return debug_string; } std::string UniquePosition::GetSuffixForTest() const { const std::string bytes = Uncompress(compressed_); const size_t prefix_len = bytes.length() - kSuffixLength; return bytes.substr(prefix_len, std::string::npos); } std::string UniquePosition::FindSmallerWithSuffix( const std::string& reference, const std::string& suffix) { size_t ref_zeroes = reference.find_first_not_of('\0'); size_t suffix_zeroes = suffix.find_first_not_of('\0'); // Neither of our inputs are allowed to have trailing zeroes, so the following // must be true. DCHECK_NE(ref_zeroes, std::string::npos); DCHECK_NE(suffix_zeroes, std::string::npos); if (suffix_zeroes > ref_zeroes) { // Implies suffix < ref. return std::string(); } if (suffix.substr(suffix_zeroes) < reference.substr(ref_zeroes)) { // Prepend zeroes so the result has as many zero digits as |reference|. return std::string(ref_zeroes - suffix_zeroes, '\0'); } else if (suffix_zeroes > 1) { // Prepend zeroes so the result has one more zero digit than |reference|. // We could also take the "else" branch below, but taking this branch will // give us a smaller result. return std::string(ref_zeroes - suffix_zeroes + 1, '\0'); } else { // Prepend zeroes to match those in the |reference|, then something smaller // than the first non-zero digit in |reference|. char lt_digit = static_cast(reference[ref_zeroes])/2; return std::string(ref_zeroes, '\0') + lt_digit; } } // static std::string UniquePosition::FindGreaterWithSuffix( const std::string& reference, const std::string& suffix) { size_t ref_FFs = reference.find_first_not_of(kuint8max); size_t suffix_FFs = suffix.find_first_not_of(kuint8max); if (ref_FFs == std::string::npos) { ref_FFs = reference.length(); } if (suffix_FFs == std::string::npos) { suffix_FFs = suffix.length(); } if (suffix_FFs > ref_FFs) { // Implies suffix > reference. return std::string(); } if (suffix.substr(suffix_FFs) > reference.substr(ref_FFs)) { // Prepend FF digits to match those in |reference|. return std::string(ref_FFs - suffix_FFs, kuint8max); } else if (suffix_FFs > 1) { // Prepend enough leading FF digits so result has one more of them than // |reference| does. We could also take the "else" branch below, but this // gives us a smaller result. return std::string(ref_FFs - suffix_FFs + 1, kuint8max); } else { // Prepend FF digits to match those in |reference|, then something larger // than the first non-FF digit in |reference|. char gt_digit = static_cast(reference[ref_FFs]) + (kuint8max - static_cast(reference[ref_FFs]) + 1) / 2; return std::string(ref_FFs, kuint8max) + gt_digit; } } // static std::string UniquePosition::FindBetweenWithSuffix( const std::string& before, const std::string& after, const std::string& suffix) { DCHECK(IsValidSuffix(suffix)); DCHECK_NE(before, after); DCHECK_LT(before, after); std::string mid; // Sometimes our suffix puts us where we want to be. if (before < suffix && suffix < after) { return std::string(); } size_t i = 0; for ( ; i < std::min(before.length(), after.length()); ++i) { uint8 a_digit = before[i]; uint8 b_digit = after[i]; if (b_digit - a_digit >= 2) { mid.push_back(a_digit + (b_digit - a_digit)/2); return mid; } else if (a_digit == b_digit) { mid.push_back(a_digit); // Both strings are equal so far. Will appending the suffix at this point // give us the comparison we're looking for? if (before.substr(i+1) < suffix && suffix < after.substr(i+1)) { return mid; } } else { DCHECK_EQ(b_digit - a_digit, 1); // Implied by above if branches. // The two options are off by one digit. The choice of whether to round // up or down here will have consequences on what we do with the remaining // digits. Exploring both options is an optimization and is not required // for the correctness of this algorithm. // Option A: Round down the current digit. This makes our |mid| < // |after|, no matter what we append afterwards. We then focus on // appending digits until |mid| > |before|. std::string mid_a = mid; mid_a.push_back(a_digit); mid_a.append(FindGreaterWithSuffix(before.substr(i+1), suffix)); // Option B: Round up the current digit. This makes our |mid| > |before|, // no matter what we append afterwards. We then focus on appending digits // until |mid| < |after|. Note that this option may not be viable if the // current digit is the last one in |after|, so we skip the option in that // case. if (after.length() > i+1) { std::string mid_b = mid; mid_b.push_back(b_digit); mid_b.append(FindSmallerWithSuffix(after.substr(i+1), suffix)); // Does this give us a shorter position value? If so, use it. if (mid_b.length() < mid_a.length()) { return mid_b; } } return mid_a; } } // If we haven't found a midpoint yet, the following must be true: DCHECK_EQ(before.substr(0, i), after.substr(0, i)); DCHECK_EQ(before, mid); DCHECK_LT(before.length(), after.length()); // We know that we'll need to append at least one more byte to |mid| in the // process of making it < |after|. Appending any digit, regardless of the // value, will make |before| < |mid|. Therefore, the following will get us a // valid position. mid.append(FindSmallerWithSuffix(after.substr(i), suffix)); return mid; } UniquePosition::UniquePosition(const std::string& internal_rep) : compressed_(internal_rep), is_valid_(IsValidBytes(Uncompress(internal_rep))) { } UniquePosition::UniquePosition( const std::string& uncompressed, const std::string& suffix) : compressed_(Compress(uncompressed)), is_valid_(IsValidBytes(uncompressed)) { DCHECK(uncompressed.rfind(suffix) + kSuffixLength == uncompressed.length()); DCHECK(IsValidSuffix(suffix)); DCHECK(IsValid()); } // On custom compression: // // Let C(x) be the compression function and U(x) be the uncompression function. // // This compression scheme has a few special properties. For one, it is // order-preserving. For any two valid position strings x and y: // x < y <=> C(x) < C(y) // This allows us keep the position strings compressed as we sort them. // // The compressed format and the decode algorithm: // // The compressed string is a series of blocks, almost all of which are 8 bytes // in length. The only exception is the last block in the compressed string, // which may be a remainder block, which has length no greater than 7. The // full-length blocks are either repeated character blocks or plain data blocks. // All blocks are entirely self-contained. Their decoded values are independent // from that of their neighbours. // // A repeated character block is encoded into eight bytes and represents between // 4 and 2^31 repeated instances of a given character in the unencoded stream. // The encoding consists of a single character repeated four times, followed by // an encoded count. The encoded count is stored as a big-endian 32 bit // integer. There are 2^31 possible count values, and two encodings for each. // The high encoding is 'enc = kuint32max - count'; the low encoding is 'enc = // count'. At compression time, the algorithm will choose between the two // encodings based on which of the two will maintain the appropriate sort // ordering (by a process which will be described below). The decompression // algorithm need not concern itself with which encoding was used; it needs only // to decode it. The decoded value of this block is "count" instances of the // character that was repeated four times in the first half of this block. // // A plain data block is encoded into eight bytes and represents exactly eight // bytes of data in the unencoded stream. The plain data block must not begin // with the same character repeated four times. It is allowed to contain such a // four-character sequence, just not at the start of the block. The decoded // value of a plain data block is identical to its encoded value. // // A remainder block has length of at most seven. It is a shorter version of // the plain data block. It occurs only at the end of the encoded stream and // represents exactly as many bytes of unencoded data as its own length. Like a // plain data block, the remainder block never begins with the same character // repeated four times. The decoded value of this block is identical to its // encoded value. // // The encode algorithm: // // From the above description, it can be seen that there may be more than one // way to encode a given input string. The encoder must be careful to choose // the encoding that guarantees sort ordering. // // The rules for the encoder are as follows: // 1. Iterate through the input string and produce output blocks one at a time. // 2. Where possible (ie. where the next four bytes of input consist of the // same character repeated four times), produce a repeated data block of // maximum possible length. // 3. If there is at least 8 bytes of data remaining and it is not possible // to produce a repeated character block, produce a plain data block. // 4. If there are less than 8 bytes of data remaining and it is not possible // to produce a repeated character block, produce a remainder block. // 5. When producing a repeated character block, the count encoding must be // chosen in such a way that the sort ordering is maintained. The choice is // best illustrated by way of example: // // When comparing two strings, the first of which begins with of 8 // instances of the letter 'B' and the second with 10 instances of the // letter 'B', which of the two should compare lower? The result depends // on the 9th character of the first string, since it will be compared // against the 9th 'B' in the second string. If that character is an 'A', // then the first string will compare lower. If it is a 'C', then the // first string will compare higher. // // The key insight is that the comparison value of a repeated character block // depends on the value of the character that follows it. If the character // follows the repeated character has a value greater than the repeated // character itself, then a shorter run length should translate to a higher // comparison value. Therefore, we encode its count using the low encoding. // Similarly, if the following character is lower, we use the high encoding. namespace { // Appends an encoded run length to |output_str|. static void WriteEncodedRunLength(uint32 length, bool high_encoding, std::string* output_str) { CHECK_GE(length, 4U); CHECK_LT(length, 0x80000000); // Step 1: Invert the count, if necessary, to account for the following digit. uint32 encoded_length; if (high_encoding) { encoded_length = 0xffffffff - length; } else { encoded_length = length; } // Step 2: Write it as big-endian so it compares correctly with memcmp(3). output_str->append(1, 0xff & (encoded_length >> 24U)); output_str->append(1, 0xff & (encoded_length >> 16U)); output_str->append(1, 0xff & (encoded_length >> 8U)); output_str->append(1, 0xff & (encoded_length >> 0U)); } // Reads an encoded run length for |str| at position |i|. static uint32 ReadEncodedRunLength(const std::string& str, size_t i) { DCHECK_LE(i + 4, str.length()); // Step 1: Extract the big-endian count. uint32 encoded_length = ((uint8)(str[i+3]) << 0) | ((uint8)(str[i+2]) << 8) | ((uint8)(str[i+1]) << 16) | ((uint8)(str[i+0]) << 24); // Step 2: If this was an inverted count, un-invert it. uint32 length; if (encoded_length & 0x80000000) { length = 0xffffffff - encoded_length; } else { length = encoded_length; } return length; } // A series of four identical chars at the beginning of a block indicates // the beginning of a repeated character block. static bool IsRepeatedCharPrefix(const std::string& chars, size_t start_index) { return chars[start_index] == chars[start_index+1] && chars[start_index] == chars[start_index+2] && chars[start_index] == chars[start_index+3]; } } // namespace // static // Wraps the CompressImpl function with a bunch of DCHECKs. std::string UniquePosition::Compress(const std::string& str) { DCHECK(IsValidBytes(str)); std::string compressed = CompressImpl(str); DCHECK(IsValidCompressed(compressed)); DCHECK_EQ(str, Uncompress(compressed)); return compressed; } // static // Performs the order preserving run length compression of a given input string. std::string UniquePosition::CompressImpl(const std::string& str) { std::string output; // The compressed length will usually be at least as long as the suffix (28), // since the suffix bytes are mostly random. Most are a few bytes longer; a // small few are tens of bytes longer. Some early tests indicated that // roughly 99% had length 40 or smaller. We guess that pre-sizing for 48 is a // good trade-off, but that has not been confirmed through profiling. output.reserve(48); // Each loop iteration will consume 8, or N bytes, where N >= 4 and is the // length of a string of identical digits starting at i. for (size_t i = 0; i < str.length(); ) { if (i + 4 <= str.length() && IsRepeatedCharPrefix(str, i)) { // Four identical bytes in a row at this position means that we must start // a repeated character block. Begin by outputting those four bytes. output.append(str, i, 4); // Determine the size of the run. const char rep_digit = str[i]; const size_t runs_until = str.find_first_not_of(rep_digit, i+4); // Handle the 'runs until end' special case specially. size_t run_length; bool encode_high; // True if the next byte is greater than |rep_digit|. if (runs_until == std::string::npos) { run_length = str.length() - i; encode_high = false; } else { run_length = runs_until - i; encode_high = static_cast(str[runs_until]) > static_cast(rep_digit); } DCHECK_LT(run_length, static_cast(kint32max)) << "This implementation can't encode run-lengths greater than 2^31."; WriteEncodedRunLength(run_length, encode_high, &output); i += run_length; // Jump forward by the size of the run length. } else { // Output up to eight bytes without any encoding. const size_t len = std::min(static_cast(8), str.length() - i); output.append(str, i, len); i += len; // Jump forward by the amount of input consumed (usually 8). } } return output; } // static // Uncompresses strings that were compresed with UniquePosition::Compress. std::string UniquePosition::Uncompress(const std::string& str) { std::string output; size_t i = 0; // Iterate through the compressed string one block at a time. for (i = 0; i + 8 <= str.length(); i += 8) { if (IsRepeatedCharPrefix(str, i)) { // Found a repeated character block. Expand it. const char rep_digit = str[i]; uint32 length = ReadEncodedRunLength(str, i+4); output.append(length, rep_digit); } else { // Found a regular block. Copy it. output.append(str, i, 8); } } // Copy the remaining bytes that were too small to form a block. output.append(str, i, std::string::npos); return output; } bool UniquePosition::IsValidCompressed(const std::string& str) { for (size_t i = 0; i + 8 <= str.length(); i += 8) { if (IsRepeatedCharPrefix(str, i)) { uint32 count = ReadEncodedRunLength(str, i+4); if (count < 4) { // A repeated character block should at least represent the four // characters that started it. return false; } if (str[i] == str[i+4]) { // Does the next digit after a count match the repeated character? Then // this is not the highest possible count. return false; } } } // We don't bother looking for the existence or checking the validity of // any partial blocks. There's no way they could be invalid anyway. return true; } } // namespace syncer